Author : Eze Anthony Anayo
Student ID: 22536729
Car Advert Dataset Price Analysis and prediction: I will be working with a dataset provided by AutoTrader, which contains anonymized car sale adverts with information on various features such as brand, type, color, mileage, and selling price. My task is to perform a structured set of tasks to uncover interesting associations and group differences that have a significant impact on the valuation of vehicles. I am looking forward to using my knowledge and skills in data understanding, exploration, preparation, and hypothesis testing to uncover valuable insights from this dataset. I am eager to dive into the world of data science and see what insights I can uncover from this dataset.
00 - public_reference:
01 - mileage:
02 - reg_code:
03 - standard_colour:
04 - standard_make:
05 - standard_model:
06 - vehicle_condition:
07 - year_of_registration:
08 - price:
09 - body_type:
10 - crossover_car_and_van:
11 - fuel_type:
#import all packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
%config InlineBackend.figure_format = 'retina'
# https://seaborn.pydata.org/tutorial/aesthetics.html
sns.set(
style='ticks',
context='talk',
font_scale=0.8,
rc={'figure.figsize': (8,6)}
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, mean_absolute_error
from functools import partial
rmse = partial(mean_squared_error, squared=False)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.dummy import DummyRegressor
from statsmodels.formula.api import ols
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
import shap
shap.initjs()
import warnings
# ignore all warnings
warnings.filterwarnings("ignore")
# Load the AutoTrader car-advert dataset from the working directory.
df = pd.read_csv('adverts.csv')
# Work on a copy so the raw frame stays untouched for reference.
adv = df.copy()
# Preview the first rows to confirm the data loaded as expected.
adv.head()
| public_reference | mileage | reg_code | standard_colour | standard_make | standard_model | vehicle_condition | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202006039777689 | 0.0 | NaN | Grey | Volvo | XC90 | NEW | NaN | 73970 | SUV | False | Petrol Plug-in Hybrid |
| 1 | 202007020778260 | 108230.0 | 61 | Blue | Jaguar | XF | USED | 2011.0 | 7000 | Saloon | False | Diesel |
| 2 | 202007020778474 | 7800.0 | 17 | Grey | SKODA | Yeti | USED | 2017.0 | 14000 | SUV | False | Petrol |
| 3 | 202007080986776 | 45000.0 | 16 | Brown | Vauxhall | Mokka | USED | 2016.0 | 7995 | Hatchback | False | Diesel |
| 4 | 202007161321269 | 64000.0 | 64 | Grey | Land Rover | Range Rover Sport | USED | 2015.0 | 26995 | SUV | False | Diesel |
adv.shape
(402005, 12)
This means there are 402005 observations (e.g. car adverts) in the dataframe and each observation has 12 features (e.g. make, model, year, etc.). This information can be useful for understanding the size and structure of the dataset and can inform further analysis.
adv.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 402005 entries, 0 to 402004 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 public_reference 402005 non-null int64 1 mileage 401878 non-null float64 2 reg_code 370148 non-null object 3 standard_colour 396627 non-null object 4 standard_make 402005 non-null object 5 standard_model 402005 non-null object 6 vehicle_condition 402005 non-null object 7 year_of_registration 368694 non-null float64 8 price 402005 non-null int64 9 body_type 401168 non-null object 10 crossover_car_and_van 402005 non-null bool 11 fuel_type 401404 non-null object dtypes: bool(1), float64(2), int64(2), object(7) memory usage: 34.1+ MB
adv.columns
Index(['public_reference', 'mileage', 'reg_code', 'standard_colour',
'standard_make', 'standard_model', 'vehicle_condition',
'year_of_registration', 'price', 'body_type', 'crossover_car_and_van',
'fuel_type'],
dtype='object')
adv.groupby('standard_make')['price'].describe().sort_values(by='count', ascending=False)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| standard_make | ||||||||
| BMW | 37376.0 | 1.991658e+04 | 14416.711836 | 495.0 | 10988.0 | 16690.0 | 24500.00 | 199995.0 |
| Audi | 35280.0 | 2.042114e+04 | 15593.255628 | 250.0 | 11383.0 | 16500.0 | 24000.00 | 293635.0 |
| Volkswagen | 34246.0 | 1.401157e+04 | 8421.031840 | 240.0 | 7790.0 | 12700.0 | 18913.75 | 80000.0 |
| Vauxhall | 33700.0 | 8.178180e+03 | 4861.187979 | 122.0 | 4795.0 | 7650.0 | 10650.00 | 47200.0 |
| Mercedes-Benz | 31917.0 | 2.159149e+04 | 18527.737075 | 325.0 | 12685.0 | 18300.0 | 26000.00 | 1000000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Pagani | 1.0 | 2.400000e+06 | NaN | 2400000.0 | 2400000.0 | 2400000.0 | 2400000.00 | 2400000.0 |
| Panther | 1.0 | 1.295000e+04 | NaN | 12950.0 | 12950.0 | 12950.0 | 12950.00 | 12950.0 |
| Reliant | 1.0 | 3.999000e+03 | NaN | 3999.0 | 3999.0 | 3999.0 | 3999.00 | 3999.0 |
| Radical | 1.0 | 1.259500e+05 | NaN | 125950.0 | 125950.0 | 125950.0 | 125950.00 | 125950.0 |
| International | 1.0 | 2.499500e+04 | NaN | 24995.0 | 24995.0 | 24995.0 | 24995.00 | 24995.0 |
110 rows × 8 columns
adv.groupby('fuel_type')['price'].describe().sort_values(by='count', ascending=False)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| fuel_type | ||||||||
| Petrol | 216929.0 | 16620.439950 | 59614.274059 | 120.0 | 6495.00 | 11000.0 | 18042.0 | 9999999.0 |
| Diesel | 158120.0 | 16505.048387 | 13040.390986 | 200.0 | 8250.00 | 13495.0 | 20495.0 | 1000000.0 |
| Petrol Hybrid | 13602.0 | 20289.115351 | 27467.195413 | 750.0 | 12499.00 | 17814.0 | 24355.0 | 2049950.0 |
| Petrol Plug-in Hybrid | 6160.0 | 35986.686851 | 60476.627361 | 5750.0 | 19288.75 | 30995.0 | 43895.0 | 2695000.0 |
| Electric | 4783.0 | 32752.126072 | 21558.832744 | 4250.0 | 18955.00 | 27894.0 | 36500.0 | 174995.0 |
| Diesel Hybrid | 1403.0 | 40469.160371 | 15884.586207 | 4980.0 | 31990.00 | 39990.0 | 51530.5 | 82175.0 |
| Bi Fuel | 221.0 | 14630.524887 | 8978.030401 | 600.0 | 11945.00 | 14000.0 | 15421.0 | 69995.0 |
| Diesel Plug-in Hybrid | 185.0 | 35633.691892 | 10341.838746 | 9850.0 | 32444.00 | 35991.0 | 39890.0 | 67035.0 |
| Natural Gas | 1.0 | 3795.000000 | NaN | 3795.0 | 3795.00 | 3795.0 | 3795.0 | 3795.0 |
(e.g., detect and deal with noise (i.e., erroneous values), missing values, and outliers; categorically-encode, rescale data; split data into predictors and target; obtain train/validation/test folds).
# Loading dataframe and Performing general operations to inspect data
# types and look for instances of missing or possibly errant data.
# Lets plot the historian data for the no show appointments
adv.head(10)
| public_reference | mileage | reg_code | standard_colour | standard_make | standard_model | vehicle_condition | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 202006039777689 | 0.0 | NaN | Grey | Volvo | XC90 | NEW | NaN | 73970 | SUV | False | Petrol Plug-in Hybrid |
| 1 | 202007020778260 | 108230.0 | 61 | Blue | Jaguar | XF | USED | 2011.0 | 7000 | Saloon | False | Diesel |
| 2 | 202007020778474 | 7800.0 | 17 | Grey | SKODA | Yeti | USED | 2017.0 | 14000 | SUV | False | Petrol |
| 3 | 202007080986776 | 45000.0 | 16 | Brown | Vauxhall | Mokka | USED | 2016.0 | 7995 | Hatchback | False | Diesel |
| 4 | 202007161321269 | 64000.0 | 64 | Grey | Land Rover | Range Rover Sport | USED | 2015.0 | 26995 | SUV | False | Diesel |
| 5 | 202009304412074 | 16000.0 | 17 | Blue | Audi | S5 | USED | 2017.0 | 29000 | Convertible | False | Petrol |
| 6 | 202007080998445 | 24075.0 | 17 | Red | Vauxhall | Viva | USED | 2017.0 | 5861 | Hatchback | False | Petrol |
| 7 | 202009244143980 | 99000.0 | 13 | Bronze | Land Rover | Range Rover | USED | 2013.0 | 24475 | SUV | False | Diesel |
| 8 | 202010014442611 | 111236.0 | 08 | Black | Mercedes-Benz | S Class | USED | 2008.0 | 6995 | Limousine | False | Diesel |
| 9 | 202006230431327 | 9500.0 | 19 | White | Land Rover | Range Rover | USED | 2019.0 | 79995 | SUV | False | Diesel |
Observing the dataframe, public_reference could be made into the DataFrame's (row) index. But let's first check its number of unique values.
# Checking the number of unique values in public_reference —
# if every row is unique, the column is just a row identifier.
adv['public_reference'].nunique()
402005
We will have to drop the public_reference column: with 402005 unique values (one per row) it is just a row identifier and cannot help our analysis.
#lets check the datatypes of all the features
adv.dtypes
public_reference int64 mileage float64 reg_code object standard_colour object standard_make object standard_model object vehicle_condition object year_of_registration float64 price int64 body_type object crossover_car_and_van bool fuel_type object dtype: object
As we can see from the observation of the datatypes, The following features are:
Analysing quantitative features in the dataset, lets take a subset of the adv, which will have the quantitative features(mileage, year_of_registration and price)
# Creating a list of the quantitative (numeric) features.
quantitative_features = ['mileage', 'year_of_registration', 'price']
# Subset of adv holding only the quantitative features, for numeric EDA.
quantitative_df = adv[quantitative_features]
quantitative_df.head(5)
| mileage | year_of_registration | price | |
|---|---|---|---|
| 0 | 0.0 | NaN | 73970 |
| 1 | 108230.0 | 2011.0 | 7000 |
| 2 | 7800.0 | 2017.0 | 14000 |
| 3 | 45000.0 | 2016.0 | 7995 |
| 4 | 64000.0 | 2015.0 | 26995 |
quantitative_df.describe(include='all')
| mileage | year_of_registration | price | |
|---|---|---|---|
| count | 401878.000000 | 368694.000000 | 4.020050e+05 |
| mean | 37743.595656 | 2015.006206 | 1.734197e+04 |
| std | 34831.724018 | 7.962667 | 4.643746e+04 |
| min | 0.000000 | 999.000000 | 1.200000e+02 |
| 25% | 10481.000000 | 2013.000000 | 7.495000e+03 |
| 50% | 28629.500000 | 2016.000000 | 1.260000e+04 |
| 75% | 56875.750000 | 2018.000000 | 2.000000e+04 |
| max | 999999.000000 | 2020.000000 | 9.999999e+06 |
There seems to be an error in year_of_registration, as the minimum value is 999 — an impossible registration year.
#checking the skewness value
quantitative_df.skew()
mileage 1.451132 year_of_registration -87.909954 price 154.681527 dtype: float64
mileage - the skewness value of 1.451132 for the mileage column suggests that the distribution of mileage values is skewed to the right, meaning that there are more values on the higher end of the range and fewer on the lower end.
year of registration - The extreme negative skewness value of -87.909954 for the year_of_registration column indicates a long left tail: most registrations are clustered in recent years, while a handful of very small (and clearly erroneous) values such as the minimum of 999 drag the tail far to the left. A magnitude this extreme is a sign of data errors rather than a genuine distribution shape, so these outliers need cleaning before analysis.
price - The skewness value of 154.681527 for the price column suggests that the distribution is heavily skewed to the right, meaning that there are a large number of lower-priced cars and a relatively small number of higher-priced cars.
This can be further confirmed by checking the Boxplots of the quantitative features below
# Helper to visualise the spread/outliers of one quantitative feature.
def boxplotter(column, x_label, y_label, title):
    """Draw a box plot of a quantitative column of ``quantitative_df``.

    Heavily right-skewed columns (mileage, price) are log10-transformed
    first so the plot is readable; ``year_of_registration`` is plotted on
    its raw scale because a log of a calendar year is meaningless.

    Parameters
    ----------
    column : str
        Column of ``quantitative_df`` to plot.
    x_label, y_label, title : str
        X-axis label, y-axis label and plot title.
    """
    data = quantitative_df[column]
    # NOTE(review): log10 of zero mileages yields -inf (warnings are
    # suppressed globally) — consider log10(x + 1) after cleaning.
    if column != 'year_of_registration':
        data = np.log10(data)
    sns.boxplot(x=data)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
# Box plots of the three quantitative features; price and mileage are
# shown on a log10 scale because of their heavy right skew.
boxplotter('price', "log10(price)", "Count", "Distribution of log10(price)")
boxplotter('mileage', "log10(mileage)", "Count", "Distribution of log10(mileage)")
boxplotter('year_of_registration', "log10(year of registration)", "Count", "Distribution of log10(year of registration)")
# Pairwise correlations between the quantitative features.
sns.heatmap(quantitative_df.corr(), annot=True)
plt.show()
# Pairwise scatter plots and marginal distributions of the same features.
sns.pairplot(quantitative_df)
plt.show()
observing the relationship from an initial glance of the pairplot, we can see that
price vs mileage: The higher the price, the lower the mileage of the car tends to be
price vs year_of_registration: The higher the price, the higher the year_of_registration
# Creating a list of the categorical features.
cat_features = ['reg_code',
                'standard_colour',
                'standard_make',
                'standard_model',
                'vehicle_condition',
                'body_type',
                'crossover_car_and_van',
                'fuel_type'
                ]
# Subset of adv holding only the categorical features, for categorical EDA.
cat_df = adv[cat_features]
cat_df.head(5)
| reg_code | standard_colour | standard_make | standard_model | vehicle_condition | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|
| 0 | NaN | Grey | Volvo | XC90 | NEW | SUV | False | Petrol Plug-in Hybrid |
| 1 | 61 | Blue | Jaguar | XF | USED | Saloon | False | Diesel |
| 2 | 17 | Grey | SKODA | Yeti | USED | SUV | False | Petrol |
| 3 | 16 | Brown | Vauxhall | Mokka | USED | Hatchback | False | Diesel |
| 4 | 64 | Grey | Land Rover | Range Rover Sport | USED | SUV | False | Diesel |
#There are 8 categorical features
cat_df.dtypes
reg_code object standard_colour object standard_make object standard_model object vehicle_condition object body_type object crossover_car_and_van bool fuel_type object dtype: object
Creating a function for getting the value counts of the data in a categorical feature
def value_counts(dataframe, column):
    """Return the frequency of each distinct value in one column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    column : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.Series
        Counts indexed by value, most frequent first.
    """
    # pandas already sorts value_counts in descending order of frequency.
    return dataframe[column].value_counts()
Creating a function for plotting the data in a categorical feature
def countplot(dataframe, column, title, xlabel, ylabel):
    """Draw a horizontal bar chart of category frequencies for *column*.

    Bars are ordered from the most to the least frequent category.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column.
    column : str
        Categorical column to plot.
    title, xlabel, ylabel : str
        Plot title and axis labels.
    """
    # Category labels ordered by descending frequency.
    order = dataframe[column].value_counts().sort_values(ascending=False).index
    axes = sns.countplot(y=dataframe[column], order=order)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    plt.show()
#Value_counts in 'fuel_type'
value_counts(cat_df,'fuel_type')
fuel_type Petrol 216929 Diesel 158120 Petrol Hybrid 13602 Petrol Plug-in Hybrid 6160 Electric 4783 Diesel Hybrid 1403 Bi Fuel 221 Diesel Plug-in Hybrid 185 Natural Gas 1 Name: count, dtype: int64
Observing the fuel_type column, Petrol and Diesel are by far the most common fuel types among the vehicles
#countplot of 'Vehicle_condition'
countplot(cat_df, 'fuel_type',
'Plot of fuel_type Count',
'Count of fuel type',
'fuel type'
)
#proportions of fuel type
fuel_type_prop = cat_df['fuel_type'].value_counts() / cat_df['fuel_type'].count()
fuel_type_prop
fuel_type Petrol 0.540426 Diesel 0.393917 Petrol Hybrid 0.033886 Petrol Plug-in Hybrid 0.015346 Electric 0.011916 Diesel Hybrid 0.003495 Bi Fuel 0.000551 Diesel Plug-in Hybrid 0.000461 Natural Gas 0.000002 Name: count, dtype: float64
#Value_counts in 'fuel_type'
value_counts(cat_df,'body_type')
body_type Hatchback 167315 SUV 115872 Saloon 36641 Estate 24692 Coupe 23258 Convertible 16038 MPV 16026 Pickup 620 Combi Van 214 Limousine 159 Minibus 149 Camper 77 Panel Van 61 Window Van 41 Chassis Cab 3 Car Derived Van 2 Name: count, dtype: int64
Hatchback, SUV, Saloon, and Estate are the most popular vehicle body types, in that order
#countplot of 'Vehicle_condition'
countplot(cat_df, 'body_type',
'Plot of body_type Count',
'Count of body type',
'body type'
)
#proportions of fuel type
body_type_prop= cat_df['body_type'].value_counts() / cat_df['body_type'].count()
body_type_prop
body_type Hatchback 0.417070 SUV 0.288837 Saloon 0.091336 Estate 0.061550 Coupe 0.057976 Convertible 0.039978 MPV 0.039948 Pickup 0.001545 Combi Van 0.000533 Limousine 0.000396 Minibus 0.000371 Camper 0.000192 Panel Van 0.000152 Window Van 0.000102 Chassis Cab 0.000007 Car Derived Van 0.000005 Name: count, dtype: float64
Browsing through the dataframe we can see some important columns like the mileage', 'standard_colour', 'standard_make','standard_model', 'vehicle_condition', 'year_of_registration', 'price', 'crossover_car_and_van', 'fuel_type' which are factors that can help with our analysis.
These columns help us ask questions like
# Let's confirm the total number of rows and columns
adv.shape
(402005, 12)
# General information about the noshowappointments dataframe
adv.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 402005 entries, 0 to 402004 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 public_reference 402005 non-null int64 1 mileage 401878 non-null float64 2 reg_code 370148 non-null object 3 standard_colour 396627 non-null object 4 standard_make 402005 non-null object 5 standard_model 402005 non-null object 6 vehicle_condition 402005 non-null object 7 year_of_registration 368694 non-null float64 8 price 402005 non-null int64 9 body_type 401168 non-null object 10 crossover_car_and_van 402005 non-null bool 11 fuel_type 401404 non-null object dtypes: bool(1), float64(2), int64(2), object(7) memory usage: 34.1+ MB
The following actions have to be performed on the dataset columns:
# General description
adv.describe(include = 'all')
| public_reference | mileage | reg_code | standard_colour | standard_make | standard_model | vehicle_condition | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.020050e+05 | 401878.000000 | 370148 | 396627 | 402005 | 402005 | 402005 | 368694.000000 | 4.020050e+05 | 401168 | 402005 | 401404 |
| unique | NaN | NaN | 72 | 22 | 110 | 1168 | 2 | NaN | NaN | 16 | 2 | 9 |
| top | NaN | NaN | 17 | Black | BMW | Golf | USED | NaN | NaN | Hatchback | False | Petrol |
| freq | NaN | NaN | 36738 | 86287 | 37376 | 11583 | 370756 | NaN | NaN | 167315 | 400210 | 216929 |
| mean | 2.020071e+14 | 37743.595656 | NaN | NaN | NaN | NaN | NaN | 2015.006206 | 1.734197e+04 | NaN | NaN | NaN |
| std | 1.691662e+10 | 34831.724018 | NaN | NaN | NaN | NaN | NaN | 7.962667 | 4.643746e+04 | NaN | NaN | NaN |
| min | 2.013072e+14 | 0.000000 | NaN | NaN | NaN | NaN | NaN | 999.000000 | 1.200000e+02 | NaN | NaN | NaN |
| 25% | 2.020090e+14 | 10481.000000 | NaN | NaN | NaN | NaN | NaN | 2013.000000 | 7.495000e+03 | NaN | NaN | NaN |
| 50% | 2.020093e+14 | 28629.500000 | NaN | NaN | NaN | NaN | NaN | 2016.000000 | 1.260000e+04 | NaN | NaN | NaN |
| 75% | 2.020102e+14 | 56875.750000 | NaN | NaN | NaN | NaN | NaN | 2018.000000 | 2.000000e+04 | NaN | NaN | NaN |
| max | 2.020110e+14 | 999999.000000 | NaN | NaN | NaN | NaN | NaN | 2020.000000 | 9.999999e+06 | NaN | NaN | NaN |
Observing the dataframe, the public reference can be dropped as it has 402005 unique values.
# public_reference is a unique row identifier (402005 distinct values),
# so it carries no predictive signal — drop it.
adv = adv.drop('public_reference', axis=1)
adv.head(1)
| mileage | reg_code | standard_colour | standard_make | standard_model | vehicle_condition | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | NaN | Grey | Volvo | XC90 | NEW | NaN | 73970 | SUV | False | Petrol Plug-in Hybrid |
We can notice that there are missing values in the dataset, The missing values are in the following columns mileage, reg_code, standard_colour, year_of_registration, body_type, and fuel_type
# confirming all datatypes
adv.dtypes
mileage float64 reg_code object standard_colour object standard_make object standard_model object vehicle_condition object year_of_registration float64 price int64 body_type object crossover_car_and_van bool fuel_type object dtype: object
#the dataframe with null values
adv[adv.isna().any(axis=1)].shape
(38581, 11)
#checking for the total number of missing values in the dataset
adv.isnull().sum().sum()
72111
#checking for the missing values in each features
adv.isnull().sum().sort_values(ascending=False)
year_of_registration 33311 reg_code 31857 standard_colour 5378 body_type 837 fuel_type 601 mileage 127 standard_make 0 standard_model 0 vehicle_condition 0 price 0 crossover_car_and_van 0 dtype: int64
adv.year_of_registration.isnull().sum()
33311
There are 33311 missing values in the year of registration
It was Observed that
From the https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_the_United_Kingdom there exist a relationship between reg_code and year_of_registration.
It was also noticed that NEW vehicles in vehicle_condition do not have a year_of_registration or reg_code. To focus the price analysis on used vehicles, we will drop the NEW cars, which effectively removes every row where vehicle_condition is 'NEW'.
#subsetting the year of registration is null, reg_code is null, and NEW condition of vehicle
adv[adv['year_of_registration'].isnull() & (adv['vehicle_condition'] == 'NEW') & (adv['reg_code'].isnull())]
| mileage | reg_code | standard_colour | standard_make | standard_model | vehicle_condition | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | NaN | Grey | Volvo | XC90 | NEW | NaN | 73970 | SUV | False | Petrol Plug-in Hybrid |
| 17 | 5.0 | NaN | NaN | Nissan | X-Trail | NEW | NaN | 27595 | SUV | False | Diesel |
| 19 | 0.0 | NaN | White | Volkswagen | T-Cross | NEW | NaN | 25000 | SUV | False | Petrol |
| 37 | 0.0 | NaN | White | Fiat | Panda | NEW | NaN | 13999 | Hatchback | False | Petrol |
| 44 | 0.0 | NaN | NaN | Honda | Civic | NEW | NaN | 19495 | Hatchback | False | Petrol |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 401860 | 10.0 | NaN | Silver | Mitsubishi | Shogun Sport | NEW | NaN | 31999 | SUV | False | Diesel |
| 401890 | 5.0 | NaN | Red | BMW | Z4 | NEW | NaN | 47910 | Convertible | False | Petrol |
| 401902 | 10.0 | NaN | White | BMW | 3 Series | NEW | NaN | 35023 | Saloon | False | Petrol |
| 401905 | 0.0 | NaN | Red | Land Rover | Range Rover Evoque | NEW | NaN | 44995 | SUV | False | Petrol |
| 401971 | 10.0 | NaN | Grey | Nissan | Leaf | NEW | NaN | 28820 | Hatchback | False | Electric |
31249 rows × 11 columns
# Drop rows that are NEW vehicles with no registration year and no reg_code
# (these cannot contribute to the used-car price analysis).
adv.drop(adv.loc[(adv['year_of_registration'].isnull()) &
                 (adv['vehicle_condition'] == 'NEW') &
                 (adv['reg_code'].isnull())].index, inplace=True)
#Checking to confirm the subset has been dropped
adv[adv['year_of_registration'].isnull() & (adv['vehicle_condition'] == 'NEW') & (adv['reg_code'].isnull())]
| mileage | reg_code | standard_colour | standard_make | standard_model | vehicle_condition | year_of_registration | price | body_type | crossover_car_and_van | fuel_type |
|---|
#checking to confirm that there are no NEW vehicles in the vehicle_condition feature
adv.vehicle_condition.value_counts()
vehicle_condition USED 370756 Name: count, dtype: int64
#dropping vehicle condition as it contains all USED vehicles
adv = adv.drop(["vehicle_condition"], axis=1)
adv.info()
<class 'pandas.core.frame.DataFrame'> Index: 370756 entries, 1 to 402004 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mileage 370629 non-null float64 1 reg_code 370148 non-null object 2 standard_colour 366418 non-null object 3 standard_make 370756 non-null object 4 standard_model 370756 non-null object 5 year_of_registration 368694 non-null float64 6 price 370756 non-null int64 7 body_type 369975 non-null object 8 crossover_car_and_van 370756 non-null bool 9 fuel_type 370316 non-null object dtypes: bool(1), float64(2), int64(1), object(6) memory usage: 28.6+ MB
#checking where year of registration and mileage is null
#we cannot fill these missing values in year of registration, because we would need the reg_code to do so
#reg code is null in this instance
adv[adv['year_of_registration'].isnull() & (adv['reg_code'].isnull())]
| mileage | reg_code | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1510 | 13406.0 | NaN | White | Land Rover | Range Rover Evoque | NaN | 26000 | Coupe | False | Diesel |
| 2631 | 1000.0 | NaN | Blue | Maserati | Levante | NaN | 63995 | SUV | False | Petrol |
| 4766 | NaN | NaN | Grey | Subaru | Outback | NaN | 35995 | Estate | False | Petrol |
| 6998 | 160.0 | NaN | Grey | McLaren | Senna | NaN | 699950 | Coupe | False | Petrol |
| 7517 | 11413.0 | NaN | NaN | MINI | Convertible | NaN | 14400 | Convertible | False | Diesel |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 392499 | 83555.0 | NaN | Black | Land Rover | Range Rover Sport | NaN | 30995 | SUV | False | Diesel |
| 392730 | 38796.0 | NaN | Grey | Rover | 110 | NaN | 3150 | Saloon | False | Petrol |
| 396985 | 29000.0 | NaN | Black | Lamborghini | Gallardo | NaN | 77990 | Coupe | False | Petrol |
| 399728 | 12812.0 | NaN | Silver | Rolls-Royce | Wraith | NaN | 159950 | Coupe | False | Petrol |
| 400536 | 40523.0 | NaN | Red | Peugeot | 108 | NaN | 5999 | Hatchback | False | Petrol |
321 rows × 10 columns
# Rows where BOTH year_of_registration and reg_code are missing cannot be
# repaired (there is nothing to infer the year from), so drop them.
adv.drop(adv[adv['year_of_registration'].isnull() &
             (adv['reg_code'].isnull())].index,
         inplace = True
         )
#checking for the dropped rows
adv[adv['year_of_registration'].isnull() & (adv['reg_code'].isnull())]
| mileage | reg_code | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type |
|---|
#checking for how many null values are left in the year_of_Registration
adv.year_of_registration.isnull().sum()
1741
#taking a sample of 20 rows where reg code is 17 and comparing the values of year of registration
#subsetting to confirm relationship of reg_code and year of registration where reg_code == 17
adv[(adv['reg_code'] == '17')]
| mileage | reg_code | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 7800.0 | 17 | Grey | SKODA | Yeti | 2017.0 | 14000 | SUV | False | Petrol |
| 5 | 16000.0 | 17 | Blue | Audi | S5 | 2017.0 | 29000 | Convertible | False | Petrol |
| 6 | 24075.0 | 17 | Red | Vauxhall | Viva | 2017.0 | 5861 | Hatchback | False | Petrol |
| 11 | 24487.0 | 17 | Black | Peugeot | 208 | 2017.0 | 8795 | Hatchback | False | Petrol |
| 14 | 31534.0 | 17 | Grey | MINI | Hatch | 2017.0 | 19000 | Hatchback | False | Petrol |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 401959 | 33356.0 | 17 | White | Fiat | 500 | 2017.0 | 6495 | Hatchback | False | Petrol |
| 401962 | 20000.0 | 17 | Silver | Mazda | Mazda6 | 2017.0 | 12395 | Saloon | False | Petrol |
| 401977 | 28225.0 | 17 | White | Mercedes-Benz | C Class | 2017.0 | 18499 | Saloon | False | Diesel |
| 401981 | 30444.0 | 17 | Black | BMW | 4 Series Gran Coupe | 2017.0 | 18990 | Saloon | False | Diesel |
| 401985 | 42377.0 | 17 | White | Mazda | CX-5 | 2017.0 | 17900 | SUV | False | Diesel |
36738 rows × 10 columns
As shown above there is a relationship between year of registration and reg_code, as the value of reg_code == 17 is 2017 on year_of_registration
#Create a lookup from reg_code to the registration year it stands for.
#NOTE: set_index('reg_code')['year_of_registration'].to_dict() keeps only
#the LAST year seen for each reg_code, so a single mistyped row poisons
#the whole mapping (and a NaN reg_code leaks in as a dictionary key).
#Taking the most common (mode) non-null year per reg_code instead is
#robust to isolated data-entry errors and never emits a NaN key.
mapping_dict = (
    adv.dropna(subset=['reg_code', 'year_of_registration'])
       .groupby('reg_code')['year_of_registration']
       .agg(lambda years: years.mode().iloc[0])
       .to_dict()
)
mapping_dict
{'61': 2011.0,
'17': 2017.0,
'16': 2016.0,
'64': 2014.0,
'13': 2013.0,
'08': 2008.0,
'19': 2019.0,
'60': 2011.0,
'69': 2019.0,
'66': 2016.0,
'12': 2012.0,
'18': 2018.0,
'65': 2015.0,
'09': 2009.0,
'B': 1964.0,
'10': 2010.0,
'14': 2014.0,
'03': 2003.0,
'67': 2017.0,
'63': 2014.0,
'15': 2015.0,
'68': 2018.0,
'56': 2006.0,
'20': 2020.0,
'70': 2020.0,
'62': 2012.0,
'11': 2011.0,
'05': 2005.0,
'59': 2009.0,
'58': 2009.0,
'W': 2000.0,
'02': 2002.0,
'57': 2007.0,
'06': 2006.0,
'52': 2002.0,
'53': 2003.0,
'04': 2004.0,
'J': 1992.0,
'07': 2007.0,
'55': 2005.0,
'51': 2002.0,
nan: 2020.0,
'X': 2000.0,
'D': nan,
'S': 1998.0,
'Y': 2001.0,
'54': 2005.0,
'H': 1991.0,
'K': 1972.0,
'E': 1988.0,
'L': 1973.0,
'V': 2000.0,
'M': 2019.0,
'G': 1968.0,
'T': 1999.0,
'N': 1995.0,
'P': 1997.0,
'A': 1963.0,
'F': 1989.0,
'R': 1998.0,
'C': 1965.0,
'94': nan,
'85': nan,
'CA': nan,
'm': 2005.0,
'FW': 1934.0,
'723xuu': nan,
'95': nan,
's': 2001.0,
'38': nan,
'k': 1992.0,
'37': nan,
'p': 1957.0}
# Fill missing registration years using the reg_code -> year lookup;
# rows whose reg_code is absent from the mapping stay NaN.
adv['year_of_registration'] = adv['year_of_registration'].fillna(adv['reg_code'].map(mapping_dict))
#checking the null values left in year
adv.year_of_registration.isnull().sum()
12
#lets see the null values left in the year of registration
adv[adv['year_of_registration'].isnull()]
| mileage | reg_code | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9940 | 32001.0 | 94 | Orange | Renault | Captur | NaN | 7400 | SUV | False | Petrol |
| 28559 | 58686.0 | 85 | Black | Hyundai | i30 | NaN | 6900 | NaN | False | Diesel |
| 80899 | 2000.0 | CA | Green | Caterham | Seven | NaN | 21995 | Convertible | False | Petrol |
| 166035 | 4050.0 | 723xuu | Turquoise | Chevrolet | Corvette | NaN | 89990 | NaN | False | Petrol |
| 167430 | 55643.0 | 95 | Grey | Citroen | DS4 | NaN | 6800 | Hatchback | False | Diesel |
| 231213 | 61370.0 | 94 | Black | Vauxhall | Mokka | NaN | 7450 | NaN | False | Diesel |
| 265390 | 23157.0 | 38 | Black | Mercedes-Benz | E Class | NaN | 26550 | NaN | False | Diesel |
| 288265 | 36500.0 | D | NaN | Aston Martin | DB6 | NaN | 365000 | Saloon | False | Petrol |
| 329449 | 39312.0 | 95 | Silver | Renault | Captur | NaN | 8000 | SUV | False | Petrol |
| 339049 | 70986.0 | D | Red | Jaguar | Mark II | NaN | 23990 | Saloon | False | Petrol |
| 357325 | 31487.0 | 37 | White | Suzuki | Baleno | NaN | 6950 | NaN | False | Petrol |
| 398645 | 22312.0 | D | Silver | Lamborghini | Diablo | NaN | 139750 | Coupe | False | Petrol |
The cars whose reg_code points to a future or implausible year will be dropped, as can be seen from the Wikipedia page on UK vehicle registration plates.
# Manual fixes for reg_codes the automatic mapping could not resolve.
# NOTE(review): per the UK registration-plate scheme, suffix 'D' denotes
# 1966 and 'CA' is not a standard age identifier — these hand-picked years
# look approximate; confirm against the Wikipedia registration table.
leftover_dict = {'D': 1964,
'CA': 1974
}
leftover_dict
{'D': 1964, 'CA': 1974}
# Patch the last few missing years via the leftover reg_code lookup
adv['year_of_registration'] = adv['year_of_registration'].fillna(
    adv['reg_code'].map(leftover_dict)
)
# Rows whose reg_code points at a future date cannot be dated reliably — keep
# only the rows that now have a registration year
adv = adv.loc[adv['year_of_registration'].notnull()]
# Confirm no missing registration years remain
adv.year_of_registration.isnull().sum()
0
# How many adverts still lack a reg_code?
adv['reg_code'].isnull().sum()
287
There are 287 missing values in the reg_code
In the case of the features year_of_registration and reg_code in a dataframe, they are providing the same information in the dataframe. This can cause problems in the data analysis because they may not be able to distinguish the unique contributions of each variable.
To address this problem, we will remove the reg_code.
# reg_code duplicates the information carried by year_of_registration — drop it
adv = adv.drop(columns='reg_code')
adv.info()
<class 'pandas.core.frame.DataFrame'> Index: 370427 entries, 1 to 402004 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mileage 370312 non-null float64 1 standard_colour 366115 non-null object 2 standard_make 370427 non-null object 3 standard_model 370427 non-null object 4 year_of_registration 370427 non-null float64 5 price 370427 non-null int64 6 body_type 369710 non-null object 7 crossover_car_and_van 370427 non-null bool 8 fuel_type 369995 non-null object dtypes: bool(1), float64(2), int64(1), object(5) memory usage: 25.8+ MB
reg_code has been dropped!!!
adv.isnull().sum().sort_values(ascending=False)
standard_colour 4312 body_type 717 fuel_type 432 mileage 115 standard_make 0 standard_model 0 year_of_registration 0 price 0 crossover_car_and_van 0 dtype: int64
Missing values can be filled with the mode (most common value) when the data is categorical or ordinal in nature. This is because the mode represents the value that occurs most frequently in the dataset, which makes it a good estimate for missing values. However, it is important to consider the context and specific characteristics of the dataset before filling missing values with the mode, as it may not always be the best approach.
def filler(data, used_col, col_null, usercase=True):
    """Fill missing values in ``col_null`` group-wise, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    used_col : str
        Column whose values define the groups.
    col_null : str
        Column whose missing entries are filled.
    usercase : bool, default True
        If True, fill with the per-group mode (first mode on ties, NaN when
        a group is entirely null); if False, fill with the per-group mean.
    """
    if usercase:
        # Per-group mode; .get(0, nan) covers groups whose mode() is empty
        # because every value in the group is null.
        group_mode = data.groupby(used_col)[col_null].transform(
            lambda x: x.mode().get(0, np.nan)
        )
        data[col_null] = np.where(data[col_null].isnull(),
                                  group_mode,
                                  data[col_null])
    else:
        # BUG FIX: the original referenced the global ``df`` here instead of
        # the ``data`` parameter, so the mean path operated on the wrong frame
        # (or raised NameError when ``df`` was absent).
        data[col_null] = data[col_null].fillna(
            data.groupby(used_col)[col_null].transform('mean')
        )
This code defines a function called "filler" that takes in four parameters: "data", "used_col", "col_null", and "usercase". The "data" parameter is a DataFrame that the function will be applied to. The "used_col" parameter is a column in the DataFrame that will be used to group the data for calculating the mode or mean. The "col_null" parameter is the column in the DataFrame that will have its null values filled in. The "usercase" parameter is a Boolean that determines whether to fill in the null values in "col_null" with the mode of the group or the mean of the group.
If "usercase" is set to True, the function will first create a new column in the DataFrame called "filled" that contains the mode of the values in "col_null" for each group defined by "used_col". Then it will replace any null values in "col_null" with the corresponding value in the "filled" column. Finally, it will drop the "filled" column.
If "usercase" is set to False, the function will fill the null values in "col_null" with the mean of the group defined by "used_col".
This code can be used to fill in missing data in a DataFrame by using the mode or mean of the data for a specific group.
#Dealing with standard_colour null values
#checking the total number of null values present in standard_colour
adv.standard_colour.isnull().sum()
4312
# Fill standard_colour with the modal colour of each manufacturer
filler(adv, 'standard_make', 'standard_colour')
# Re-count the nulls that remain
adv.standard_colour.isnull().sum()
2
#Dealing with body_type null values
#checking the total number of null values present in body_type
adv.body_type.isnull().sum()
717
# Fill body_type with the modal body type of each manufacturer
filler(adv, 'standard_make', 'body_type')
# Re-count the nulls that remain
adv.body_type.isnull().sum()
5
The standard_colour and body_type columns have 2 and 5 null values left, respectively. These nulls cannot be filled via standard make or model, so let's fill them with the overall column mode in both cases.
# The handful of rows that resisted group-wise filling get the global mode
for col in ('standard_colour', 'body_type'):
    adv[col] = adv[col].fillna(adv[col].mode()[0])
adv.isnull().sum().sort_values(ascending=False)
fuel_type 432 mileage 115 standard_colour 0 standard_make 0 standard_model 0 year_of_registration 0 price 0 body_type 0 crossover_car_and_van 0 dtype: int64
#Dealing with fuel_type null values
#checking the total number of null values present in fuel_type
adv.fuel_type.isnull().sum()
432
# Fill fuel_type with the modal fuel of each body type
filler(adv, 'body_type', 'fuel_type')
# Re-count the nulls that remain
adv.fuel_type.isnull().sum()
0
#Dealing with mileage null values
#checking the total number of null values present in mileage
adv.mileage.isnull().sum()
115
# Fill mileage with the mean mileage of cars registered in the same year
filler(adv, 'year_of_registration', 'mileage', usercase=False)
# Re-count the nulls that remain
adv.mileage.isnull().sum()
7
adv[adv['mileage'].isnull()]
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|
| 26974 | NaN | Blue | Mercedes-Benz | E Class | 2011.0 | 7950 | Coupe | False | Diesel |
| 56864 | NaN | Grey | Land Rover | Defender 110 | 2020.0 | 49950 | SUV | False | Diesel |
| 218201 | NaN | Grey | Volkswagen | Beetle | 2005.0 | 1199 | Hatchback | False | Petrol |
| 298732 | NaN | Purple | Vauxhall | Astra | 1968.0 | 4999 | Convertible | False | Petrol |
| 345529 | NaN | White | Fiat | 500 | 2019.0 | 8995 | Hatchback | False | Petrol |
| 351673 | NaN | Red | Land Rover | Defender 110 | 2001.0 | 12250 | Pickup | False | Diesel |
| 390372 | NaN | Grey | Subaru | Levorg | 2020.0 | 29995 | Estate | False | Petrol |
# The stragglers have no usable group — fall back to the overall mean mileage
adv['mileage'] = adv['mileage'].fillna(adv['mileage'].mean())
# Confirm mileage is now complete
adv.mileage.isnull().sum()
0
adv.isnull().sum().sort_values(ascending=False)
mileage 0 standard_colour 0 standard_make 0 standard_model 0 year_of_registration 0 price 0 body_type 0 crossover_car_and_van 0 fuel_type 0 dtype: int64
All missing values have been sufficiently dealt with
adv.describe()
| mileage | year_of_registration | price | |
|---|---|---|---|
| count | 370427.000000 | 370427.000000 | 3.704270e+05 |
| mean | 40929.823536 | 2015.010329 | 1.584210e+04 |
| std | 34430.452917 | 7.955420 | 2.536311e+04 |
| min | 0.000000 | 999.000000 | 1.200000e+02 |
| 25% | 14454.000000 | 2013.000000 | 6.999000e+03 |
| 50% | 31867.000000 | 2016.000000 | 1.189000e+04 |
| 75% | 60000.000000 | 2018.000000 | 1.850000e+04 |
| max | 999999.000000 | 2020.000000 | 3.799995e+06 |
The outliers are most found in numerical data such as continuous variables (e.g. mileage, price) or discrete variables (e.g. count data). Outliers can have a large impact on the analysis and results of a dataset, and can skew the overall distribution of the data.
Outliers can be caused by various factors such as measurement error, data entry errors, or genuine extreme cases. It's important to identify and handle outliers appropriately, as they can have a significant impact on the statistical properties of a dataset, such as the mean, median, and standard deviation.
In our case, let's visualize the numeric columns with boxplots to spot any outliers:
# Quick look at the spread of the three numeric columns via stacked boxplots
plt.figure(figsize=(16, 10))
numeric_panels = [('year_of_registration', 'Year of Registration'),
                  ('mileage', 'Mileage'),
                  ('price', 'Price')]
for pos, (col, label) in enumerate(numeric_panels, start=1):
    plt.subplot(3, 1, pos)
    sns.boxplot(x=adv[col])
    plt.xlabel(label)
plt.tight_layout()  # keep the subplots from overlapping
plt.show()
adv[adv['year_of_registration'] < 1900].shape
(17, 9)
Observing the boxplot for year of registration, some vehicles' registration dates fall below the year 1900. The total count of such noisy values in the year of registration is 17.
# checking for noise and error values in the year of registration
adv[adv['year_of_registration'] < 1900]
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|
| 59010 | 14000.0 | Blue | Toyota | Prius | 1007.0 | 7000 | Hatchback | False | Petrol Hybrid |
| 69516 | 96659.0 | Black | Audi | A4 Avant | 1515.0 | 10385 | Estate | False | Diesel |
| 84501 | 37771.0 | Black | Smart | fortwo | 1063.0 | 4785 | Coupe | False | Petrol |
| 114737 | 30000.0 | Red | Toyota | AYGO | 1009.0 | 4695 | Hatchback | False | Petrol |
| 120858 | 27200.0 | Black | MINI | Clubman | 1016.0 | 18990 | Estate | False | Diesel |
| 190556 | 58470.0 | Black | Fiat | Punto Evo | 1010.0 | 3785 | Hatchback | False | Petrol |
| 199830 | 23000.0 | Silver | MINI | Hatch | 1009.0 | 5995 | Hatchback | False | Petrol |
| 199987 | 104000.0 | Silver | BMW | 1 Series | 1008.0 | 4395 | Convertible | False | Petrol |
| 201616 | 8600.0 | Silver | BMW | M2 | 1018.0 | 41990 | Coupe | False | Petrol |
| 201626 | 69346.0 | Red | Mazda | Mazda3 | 999.0 | 8795 | Saloon | False | Petrol |
| 201773 | 19000.0 | Silver | Mercedes-Benz | C Class | 1007.0 | 7295 | Saloon | False | Petrol |
| 213059 | 54569.0 | Silver | BMW | Z4 | 999.0 | 7999 | Convertible | False | Petrol |
| 274622 | 6353.0 | Blue | BMW | 2 Series | 1017.0 | 18990 | Convertible | False | Petrol |
| 334679 | 107934.0 | Blue | Audi | A3 | 999.0 | 8895 | Hatchback | False | Diesel |
| 339167 | 38000.0 | White | MINI | Hatch | 1007.0 | 4795 | Hatchback | False | Petrol |
| 374798 | 58000.0 | Silver | Toyota | RAV4 | 1006.0 | 5994 | SUV | False | Petrol |
| 387633 | 39624.0 | Red | MINI | Clubman | 1015.0 | 15290 | Estate | False | Petrol |
#Reg code was droppped previously during cleaning,
#we need it to deal with the error values in year of registration
df[df['year_of_registration'] < 1900]
| public_reference | mileage | reg_code | standard_colour | standard_make | standard_model | vehicle_condition | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 59010 | 202006270588110 | 14000.0 | 07 | Blue | Toyota | Prius | USED | 1007.0 | 7000 | Hatchback | False | Petrol Hybrid |
| 69516 | 202010155035879 | 96659.0 | 65 | Black | Audi | A4 Avant | USED | 1515.0 | 10385 | Estate | False | Diesel |
| 84501 | 202009163810376 | 37771.0 | 63 | Black | Smart | fortwo | USED | 1063.0 | 4785 | Coupe | False | Petrol |
| 114737 | 202008102305925 | 30000.0 | 59 | Red | Toyota | AYGO | USED | 1009.0 | 4695 | Hatchback | False | Petrol |
| 120858 | 202010064654489 | 27200.0 | 66 | Black | MINI | Clubman | USED | 1016.0 | 18990 | Estate | False | Diesel |
| 190556 | 202010205206488 | 58470.0 | 10 | Black | Fiat | Punto Evo | USED | 1010.0 | 3785 | Hatchback | False | Petrol |
| 199830 | 202009013167637 | 23000.0 | 59 | Silver | MINI | Hatch | USED | 1009.0 | 5995 | Hatchback | False | Petrol |
| 199987 | 202010225311657 | 104000.0 | 08 | Silver | BMW | 1 Series | USED | 1008.0 | 4395 | Convertible | False | Petrol |
| 201616 | 202010134937656 | 8600.0 | 68 | Silver | BMW | M2 | USED | 1018.0 | 41990 | Coupe | False | Petrol |
| 201626 | 202010155037484 | 69346.0 | 64 | Red | Mazda | Mazda3 | USED | 999.0 | 8795 | Saloon | False | Petrol |
| 201773 | 202008042076716 | 19000.0 | 57 | Silver | Mercedes-Benz | C Class | USED | 1007.0 | 7295 | Saloon | False | Petrol |
| 213059 | 202009304380359 | 54569.0 | 08 | Silver | BMW | Z4 | USED | 999.0 | 7999 | Convertible | False | Petrol |
| 274622 | 202010024511934 | 6353.0 | 17 | Blue | BMW | 2 Series | USED | 1017.0 | 18990 | Convertible | False | Petrol |
| 334679 | 202010094789497 | 107934.0 | 13 | Blue | Audi | A3 | USED | 999.0 | 8895 | Hatchback | False | Diesel |
| 339167 | 202006270588115 | 38000.0 | 57 | White | MINI | Hatch | USED | 1007.0 | 4795 | Hatchback | False | Petrol |
| 374798 | 202008042076722 | 58000.0 | 55 | Silver | Toyota | RAV4 | USED | 1006.0 | 5994 | SUV | False | Petrol |
| 387633 | 202010195174849 | 39624.0 | 65 | Red | MINI | Clubman | USED | 1015.0 | 15290 | Estate | False | Petrol |
#using functions and mapping dictionary to replace value in the column
def replace_value(df, mapping_dict):
"""
Replace the values in the 'year_of_registration' column of the DataFrame using a mapping dictionary
Parameters:
df (DataFrame): Dataframe which needs to be modified
mapping_dict (dict): Dictionary containing the mapping of old values to new values
Returns:
DataFrame : Modified Dataframe
"""
# Replace the values in the year of registration column using the mapping dictionary
df['year_of_registration'].replace(mapping_dict, inplace=True)
return df
#creating a mapping dictionary using the regcode from the UK vehicle registration wikipedia link
# Corrections for garbled registration years, derived from the UK vehicle
# registration reg codes (see the Wikipedia article on UK registration plates).
# BUG FIX: the original literal listed the key 999.0 THREE times (2014, 2008,
# 2014); duplicate keys in a dict literal silently collapse to the last one,
# so 999.0 effectively mapped to 2014. The duplicates are removed here and the
# effective value kept explicitly.
fix_error_dict = {
    1016.0: 2016,  # reg code 66
    1017.0: 2017,  # 17
    1006.0: 2006,  # 06/55
    999.0: 2014,   # ambiguous in the source rows; last-wins value retained
    1515.0: 2015,  # 65
    1007.0: 2007,  # 57/07
    1008.0: 2008,  # 08
    1018.0: 2018,  # 68
    1010.0: 2010,  # 10
    1015.0: 2015,  # 65
    1009.0: 2009,  # 59
    1063.0: 2013,  # 63
}
fix_error_dict
fix_error_dict
{1016.0: 2016,
1017.0: 2017,
1006.0: 2006,
999.0: 2014,
1515.0: 2015,
1007.0: 2007,
1008.0: 2008,
1018.0: 2018,
1010.0: 2010,
1015.0: 2015,
1009.0: 2009,
1063.0: 2013}
# Apply the corrections, then verify nothing registered before 1900 remains
adv = replace_value(adv, fix_error_dict)
adv.loc[adv['year_of_registration'] < 1900]
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type |
|---|
adv.shape
(370427, 9)
# Remove univariate outliers with Tukey's 1.5*IQR fences on every numeric column
numeric_cols = adv.select_dtypes(include=[np.number]).columns
q1 = adv[numeric_cols].quantile(0.25)   # first quartile
q3 = adv[numeric_cols].quantile(0.75)   # third quartile
iqr = q3 - q1                           # interquartile range
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
# A row is dropped if ANY of its numeric values falls outside its fences
is_outlier = (adv[numeric_cols] < lower) | (adv[numeric_cols] > upper)
adv = adv[~is_outlier.any(axis=1)]
# How many rows survived?
adv.shape
(332102, 9)
Let's visualize the data again now that the outliers have been removed.
# Re-inspect the three numeric distributions after outlier removal
fig, axes = plt.subplots(3, 1, figsize=(16, 10))
panels = [('year_of_registration', 'Year of Registration'),
          ('mileage', 'Mileage'),
          ('price', 'Price')]
for ax, (col, label) in zip(axes, panels):
    sns.boxplot(x=adv[col], ax=ax)
    ax.set_xlabel(label)
fig.tight_layout()  # keep the subplots from overlapping
plt.show()
adv.sort_values('price', ascending=False).head(1)
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|
| 231908 | 21000.0 | Silver | Land Rover | Range Rover Evoque | 2018.0 | 35750 | Convertible | False | Diesel |
adv.sort_values('year_of_registration', ascending=False).head(1)
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|
| 142569 | 2238.0 | Grey | Fiat | 500 | 2020.0 | 9299 | Hatchback | False | Petrol |
adv.sort_values('mileage', ascending=False).head(1)
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|
| 397621 | 128300.0 | Silver | Vauxhall | Astra | 2012.0 | 2450 | Hatchback | False | Diesel |
We have cleaned up our dataframe. We can now explore the dataframe by performing some Data Transformations
# Registration year is a whole number — store it as an integer column
adv['year_of_registration'] = adv['year_of_registration'].astype(int)
adv.dtypes
mileage float64 standard_colour object standard_make object standard_model object year_of_registration int64 price int64 body_type object crossover_car_and_van bool fuel_type object dtype: object
adv.head()
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 108230.0 | Blue | Jaguar | XF | 2011 | 7000 | Saloon | False | Diesel |
| 2 | 7800.0 | Grey | SKODA | Yeti | 2017 | 14000 | SUV | False | Petrol |
| 3 | 45000.0 | Brown | Vauxhall | Mokka | 2016 | 7995 | Hatchback | False | Diesel |
| 4 | 64000.0 | Grey | Land Rover | Range Rover Sport | 2015 | 26995 | SUV | False | Diesel |
| 5 | 16000.0 | Blue | Audi | S5 | 2017 | 29000 | Convertible | False | Petrol |
#checking for the max and min of year of registration
adv['year_of_registration'].min(), adv['year_of_registration'].max()
(2006, 2020)
adv.year_of_registration.describe()
count 332102.000000 mean 2015.467853 std 3.320016 min 2006.000000 25% 2014.000000 50% 2016.000000 75% 2018.000000 max 2020.000000 Name: year_of_registration, dtype: float64
# Derive an ordinal age category from the registration year
age_bins = [2005, 2010, 2015, 2018, 2021]
age_labels = ['VERY OLD', 'OLD', 'FAIRLY NEW', 'NEW']
adv['condition'] = pd.cut(adv['year_of_registration'],
                          bins=age_bins,
                          labels=age_labels)
print(adv.dtypes)
adv.head()
mileage float64 standard_colour object standard_make object standard_model object year_of_registration int64 price int64 body_type object crossover_car_and_van bool fuel_type object condition category dtype: object
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | condition | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 108230.0 | Blue | Jaguar | XF | 2011 | 7000 | Saloon | False | Diesel | OLD |
| 2 | 7800.0 | Grey | SKODA | Yeti | 2017 | 14000 | SUV | False | Petrol | FAIRLY NEW |
| 3 | 45000.0 | Brown | Vauxhall | Mokka | 2016 | 7995 | Hatchback | False | Diesel | FAIRLY NEW |
| 4 | 64000.0 | Grey | Land Rover | Range Rover Sport | 2015 | 26995 | SUV | False | Diesel | OLD |
| 5 | 16000.0 | Blue | Audi | S5 | 2017 | 29000 | Convertible | False | Petrol | FAIRLY NEW |
# Keep the derived condition column as a plain object dtype, matching the
# other categorical features
adv['condition'] = adv['condition'].astype('object')
adv.dtypes
mileage float64 standard_colour object standard_make object standard_model object year_of_registration int64 price int64 body_type object crossover_car_and_van bool fuel_type object condition object dtype: object
To further enhance our analysis, we derived a categorical variable based on the "year_of_registration" column. This transformation allows us to capture the age or vintage of the vehicles in a more intuitive and meaningful way.
First, we examined the range of values in the "year_of_registration" column. The minimum and maximum values were found to be 2006 and 2020, respectively. The descriptive statistics of the column indicated a mean of 2015.47, with a standard deviation of 3.32. The data distribution revealed that the majority of vehicles fell within the years 2014 to 2018, with the median year of registration being 2016.
To create the categorical variable, we segmented the years of registration into four distinct categories: "VERY OLD," "OLD," "FAIRLY NEW," and "NEW." We used the pandas cut() function to assign each year of registration to the corresponding category based on predefined bins. The bin ranges were defined as [2005, 2010, 2015, 2018, 2021], and the labels for each category were set accordingly.
After creating the new column, named "condition," we confirmed its inclusion in the dataset by examining the updated data types. The "condition" column was successfully added as a categorical variable, denoted by the "category" data type. The dataset preview displayed the newly created column alongside the existing features.
To ensure consistency in the data type, we converted the "condition" column from the categorical type to the object type, aligning it with the other object-type features in the dataset. This conversion allows for seamless integration and compatibility in subsequent data analysis and modeling processes.
By incorporating this derived categorical variable, we gain a more comprehensive understanding of the vehicles' age distribution, which can provide valuable insights and improve the accuracy of our analysis.
adv['mileage'].min(), adv['mileage'].max()
(0.0, 128300.0)
adv['mileage'].describe()
count 332102.000000 mean 38946.129769 std 29736.761968 min 0.000000 25% 15130.250000 50% 31761.500000 75% 57525.000000 max 128300.000000 Name: mileage, dtype: float64
# Derive a usage band (LOW / AVERAGE / HIGH) from the mileage
mileage_bins = [0.0, 30000.0, 60000.0, 130000.0]
usage_labels = ['LOW', 'AVERAGE', 'HIGH']
adv['usage'] = pd.cut(adv['mileage'],
                      bins=mileage_bins,
                      labels=usage_labels,
                      right=False,          # bands are half-open: [low, high)
                      include_lowest=True)  # zero-mileage cars land in LOW
adv.head()
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | condition | usage | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 108230.0 | Blue | Jaguar | XF | 2011 | 7000 | Saloon | False | Diesel | OLD | HIGH |
| 2 | 7800.0 | Grey | SKODA | Yeti | 2017 | 14000 | SUV | False | Petrol | FAIRLY NEW | LOW |
| 3 | 45000.0 | Brown | Vauxhall | Mokka | 2016 | 7995 | Hatchback | False | Diesel | FAIRLY NEW | AVERAGE |
| 4 | 64000.0 | Grey | Land Rover | Range Rover Sport | 2015 | 26995 | SUV | False | Diesel | OLD | HIGH |
| 5 | 16000.0 | Blue | Audi | S5 | 2017 | 29000 | Convertible | False | Petrol | FAIRLY NEW | LOW |
# Store usage as a plain object dtype for consistency with the other features
adv['usage'] = adv['usage'].astype('object')
adv.dtypes
mileage float64 standard_colour object standard_make object standard_model object year_of_registration int64 price int64 body_type object crossover_car_and_van bool fuel_type object condition object usage object dtype: object
adv.columns
Index(['mileage', 'standard_colour', 'standard_make', 'standard_model',
'year_of_registration', 'price', 'body_type', 'crossover_car_and_van',
'fuel_type', 'condition', 'usage'],
dtype='object')
The code is performing feature engineering to derive a new feature called 'usage' based on the 'mileage' feature. The goal is to categorize the mileage of the vehicles into three categories: 'LOW', 'AVERAGE', and 'HIGH'.
First, the code is checking the minimum and maximum values of the 'mileage' column to determine the range of values in the dataset.
Then, a new column called 'usage' is created in the 'adv' DataFrame using the cut() method. This method creates a new categorical variable based on the values of an existing numerical variable. In this case, the cut() method is dividing the 'mileage' column into three categories based on the range of values provided in the bins parameter. The labels parameter specifies the labels to use for each category.
The resulting 'usage' column is added to the DataFrame using the square bracket notation. The head() method is used to display the first few rows of the DataFrame to verify that the new 'usage' column has been added correctly.
Finally, the astype() method is used to convert the 'usage' column from a categorical variable to an object data type. This is done to ensure compatibility with certain machine learning models that require all columns to be numeric or object data types.
Overall, this feature engineering approach can be useful for understanding the relationship between the mileage of a vehicle and its effectiveness or value. For instance, vehicles with low mileage may be more valuable or effective than those with high mileage because they are perceived to have more useful life remaining. On the other hand, high mileage vehicles may be considered less valuable or effective due to the perception that they may have more wear and tear.
Deriving a new feature called 'usage' from the 'mileage' feature provides a way to capture the usage aspect of the data in a more interpretable and meaningful way.
# Manufacturer Popularity
manufacturer_popularity = adv['standard_make'].value_counts(normalize=True)
manufacturer_popularity
standard_make
Vauxhall 0.095558
BMW 0.094784
Volkswagen 0.088801
Audi 0.086835
Mercedes-Benz 0.078934
...
Replica 0.000003
Plymouth 0.000003
Ariel 0.000003
Pilgrim 0.000003
Custom Vehicle 0.000003
Name: proportion, Length: 75, dtype: float64
manufacturer_popularity.min(), manufacturer_popularity.max()
(3.0111230886896192e-06, 0.09555799121956507)
manufacturer_popularity.describe()
count 75.000000 mean 0.013333 std 0.024209 min 0.000003 25% 0.000009 50% 0.000608 75% 0.019907 max 0.095558 Name: proportion, dtype: float64
adv['manufacturer_popularity'] = adv['standard_make'].map(manufacturer_popularity)
adv.head(10)
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | condition | usage | manufacturer_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 108230.0 | Blue | Jaguar | XF | 2011 | 7000 | Saloon | False | Diesel | OLD | HIGH | 0.015637 |
| 2 | 7800.0 | Grey | SKODA | Yeti | 2017 | 14000 | SUV | False | Petrol | FAIRLY NEW | LOW | 0.031587 |
| 3 | 45000.0 | Brown | Vauxhall | Mokka | 2016 | 7995 | Hatchback | False | Diesel | FAIRLY NEW | AVERAGE | 0.095558 |
| 4 | 64000.0 | Grey | Land Rover | Range Rover Sport | 2015 | 26995 | SUV | False | Diesel | OLD | HIGH | 0.024086 |
| 5 | 16000.0 | Blue | Audi | S5 | 2017 | 29000 | Convertible | False | Petrol | FAIRLY NEW | LOW | 0.086835 |
| 6 | 24075.0 | Red | Vauxhall | Viva | 2017 | 5861 | Hatchback | False | Petrol | FAIRLY NEW | LOW | 0.095558 |
| 7 | 99000.0 | Bronze | Land Rover | Range Rover | 2013 | 24475 | SUV | False | Diesel | OLD | HIGH | 0.024086 |
| 8 | 111236.0 | Black | Mercedes-Benz | S Class | 2008 | 6995 | Limousine | False | Diesel | VERY OLD | HIGH | 0.078934 |
| 10 | 77000.0 | Grey | Volkswagen | Passat | 2010 | 4499 | Saloon | False | Diesel | VERY OLD | HIGH | 0.088801 |
| 11 | 24487.0 | Black | Peugeot | 208 | 2017 | 8795 | Hatchback | False | Petrol | FAIRLY NEW | LOW | 0.042243 |
# Bucket each manufacturer's popularity share into four named bands
pop_bins = [0, 0.020, 0.05, 0.075, 0.099]
pop_labels = ['Low', 'Medium', 'High', 'Very High']
adv['manufacturer_popularity'] = pd.cut(adv['manufacturer_popularity'],
                                        bins=pop_bins,
                                        labels=pop_labels,
                                        include_lowest=True)
# Store the banded feature as a plain object dtype
adv['manufacturer_popularity'] = adv['manufacturer_popularity'].astype('object')
adv.dtypes
# Inspect the modified DataFrame
adv.head(10)
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | condition | usage | manufacturer_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 108230.0 | Blue | Jaguar | XF | 2011 | 7000 | Saloon | False | Diesel | OLD | HIGH | Low |
| 2 | 7800.0 | Grey | SKODA | Yeti | 2017 | 14000 | SUV | False | Petrol | FAIRLY NEW | LOW | Medium |
| 3 | 45000.0 | Brown | Vauxhall | Mokka | 2016 | 7995 | Hatchback | False | Diesel | FAIRLY NEW | AVERAGE | Very High |
| 4 | 64000.0 | Grey | Land Rover | Range Rover Sport | 2015 | 26995 | SUV | False | Diesel | OLD | HIGH | Medium |
| 5 | 16000.0 | Blue | Audi | S5 | 2017 | 29000 | Convertible | False | Petrol | FAIRLY NEW | LOW | Very High |
| 6 | 24075.0 | Red | Vauxhall | Viva | 2017 | 5861 | Hatchback | False | Petrol | FAIRLY NEW | LOW | Very High |
| 7 | 99000.0 | Bronze | Land Rover | Range Rover | 2013 | 24475 | SUV | False | Diesel | OLD | HIGH | Medium |
| 8 | 111236.0 | Black | Mercedes-Benz | S Class | 2008 | 6995 | Limousine | False | Diesel | VERY OLD | HIGH | Very High |
| 10 | 77000.0 | Grey | Volkswagen | Passat | 2010 | 4499 | Saloon | False | Diesel | VERY OLD | HIGH | Very High |
| 11 | 24487.0 | Black | Peugeot | 208 | 2017 | 8795 | Hatchback | False | Petrol | FAIRLY NEW | LOW | Medium |
# Covert feature to object
adv['manufacturer_popularity'] = adv['manufacturer_popularity'].astype('object')
adv.dtypes
mileage float64 standard_colour object standard_make object standard_model object year_of_registration int64 price int64 body_type object crossover_car_and_van bool fuel_type object condition object usage object manufacturer_popularity object dtype: object
In order to derive a feature based on domain knowledge, we created a column called "manufacturer_popularity" to analyze the popularity of different car manufacturers. The proportion of each manufacturer within the dataset was calculated using the "standard_make" column. The results showed the distribution of manufacturer popularity, ranging from a minimum of 3.01e-06 to a maximum of 0.0956.
To categorize the manufacturer popularity, we defined bin edges based on percentiles and assigned labels to each bin. The manufacturers were then classified into four categories: "Low," "Medium," "High," and "Very High" based on their popularity.
The modified dataset displayed the newly created "manufacturer_popularity" column, representing the categorized popularity of each manufacturer. Additionally, the "manufacturer_popularity" column was converted to the object type for consistency.
To ensure data consistency, we also converted the "crossover_car_and_van" column to the object type.
The dataset was thoroughly checked for missing values, and no missing values were found in any of the columns.
Overall, the "manufacturer_popularity" feature provides insights into the popularity of different car manufacturers, enabling further analysis and comparisons in the dataset.
# Store the crossover flag as object dtype alongside the other categoricals
adv['crossover_car_and_van'] = adv['crossover_car_and_van'].astype('object')
adv.dtypes
mileage float64 standard_colour object standard_make object standard_model object year_of_registration int64 price int64 body_type object crossover_car_and_van object fuel_type object condition object usage object manufacturer_popularity object dtype: object
adv.isnull().sum().sort_values(ascending=False)
mileage 0 standard_colour 0 standard_make 0 standard_model 0 year_of_registration 0 price 0 body_type 0 crossover_car_and_van 0 fuel_type 0 condition 0 usage 0 manufacturer_popularity 0 dtype: int64
adv.head()
| mileage | standard_colour | standard_make | standard_model | year_of_registration | price | body_type | crossover_car_and_van | fuel_type | condition | usage | manufacturer_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 108230.0 | Blue | Jaguar | XF | 2011 | 7000 | Saloon | False | Diesel | OLD | HIGH | Low |
| 2 | 7800.0 | Grey | SKODA | Yeti | 2017 | 14000 | SUV | False | Petrol | FAIRLY NEW | LOW | Medium |
| 3 | 45000.0 | Brown | Vauxhall | Mokka | 2016 | 7995 | Hatchback | False | Diesel | FAIRLY NEW | AVERAGE | Very High |
| 4 | 64000.0 | Grey | Land Rover | Range Rover Sport | 2015 | 26995 | SUV | False | Diesel | OLD | HIGH | Medium |
| 5 | 16000.0 | Blue | Audi | S5 | 2017 | 29000 | Convertible | False | Petrol | FAIRLY NEW | LOW | Very High |
adv.info()
<class 'pandas.core.frame.DataFrame'> Index: 332102 entries, 1 to 402004 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mileage 332102 non-null float64 1 standard_colour 332102 non-null object 2 standard_make 332102 non-null object 3 standard_model 332102 non-null object 4 year_of_registration 332102 non-null int64 5 price 332102 non-null int64 6 body_type 332102 non-null object 7 crossover_car_and_van 332102 non-null object 8 fuel_type 332102 non-null object 9 condition 332102 non-null object 10 usage 332102 non-null object 11 manufacturer_popularity 332102 non-null object dtypes: float64(1), int64(2), object(9) memory usage: 32.9+ MB
# Split the data into the feature matrix X and the target vector y (price).
y = adv['price']
X = adv.drop(columns='price')
print("The shape of X:", X.shape)
print("The shape of y:", y.shape)
The shape of X: (332102, 11) The shape of y: (332102,)
# Preview the first rows of the feature matrix.
X.head()
| mileage | standard_colour | standard_make | standard_model | year_of_registration | body_type | crossover_car_and_van | fuel_type | condition | usage | manufacturer_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 108230.0 | Blue | Jaguar | XF | 2011 | Saloon | False | Diesel | OLD | HIGH | Low |
| 2 | 7800.0 | Grey | SKODA | Yeti | 2017 | SUV | False | Petrol | FAIRLY NEW | LOW | Medium |
| 3 | 45000.0 | Brown | Vauxhall | Mokka | 2016 | Hatchback | False | Diesel | FAIRLY NEW | AVERAGE | Very High |
| 4 | 64000.0 | Grey | Land Rover | Range Rover Sport | 2015 | SUV | False | Diesel | OLD | HIGH | Medium |
| 5 | 16000.0 | Blue | Audi | S5 | 2017 | Convertible | False | Petrol | FAIRLY NEW | LOW | Very High |
# Preview the first rows of the target variable.
y.head()
1 7000 2 14000 3 7995 4 26995 5 29000 Name: price, dtype: int64
# Hold out 25% of the adverts as a test set; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40623)

# Report the shape of every split.
for _name, _part in (("X_train", X_train), ("X_test", X_test),
                     ("y_train", y_train), ("y_test", y_test)):
    print(f"{_name} shape:", _part.shape)
X_train shape: (249076, 11) X_test shape: (83026, 11) y_train shape: (249076,) y_test shape: (83026,)
# Peek at the training features after the split.
X_train.head()
| mileage | standard_colour | standard_make | standard_model | year_of_registration | body_type | crossover_car_and_van | fuel_type | condition | usage | manufacturer_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 64508 | 46600.0 | Red | Nissan | Juke | 2012 | SUV | False | Petrol | OLD | AVERAGE | High |
| 41887 | 36617.0 | Black | BMW | 3 Series | 2016 | Saloon | False | Petrol Plug-in Hybrid | FAIRLY NEW | AVERAGE | Very High |
| 219334 | 14685.0 | White | Nissan | Leaf | 2014 | Hatchback | False | Electric | OLD | LOW | High |
| 130303 | 21207.0 | Orange | Suzuki | Ignis | 2017 | Hatchback | False | Petrol | FAIRLY NEW | LOW | Low |
| 358615 | 82078.0 | Grey | BMW | 1 Series | 2014 | Hatchback | False | Diesel | OLD | HIGH | Very High |
# List the feature column names.
X_train.columns
Index(['mileage', 'standard_colour', 'standard_make', 'standard_model',
'year_of_registration', 'body_type', 'crossover_car_and_van',
'fuel_type', 'condition', 'usage', 'manufacturer_popularity'],
dtype='object')
# Check the dtypes of the training features.
X_train.dtypes
mileage float64 standard_colour object standard_make object standard_model object year_of_registration int64 body_type object crossover_car_and_van object fuel_type object condition object usage object manufacturer_popularity object dtype: object
# Categorical columns whose category frequencies we want to inspect before
# choosing an encoding strategy for each.
columns_to_count = [
    'standard_colour', 'standard_make', 'standard_model', 'body_type',
    'crossover_car_and_van', 'fuel_type', 'condition', 'usage',
    'manufacturer_popularity',
]
for col in columns_to_count:
    print("Value counts for column " + col + ":")
    print(adv[col].value_counts())
    print()
Value counts for column standard_colour:
standard_colour
Black 72936
White 62200
Grey 54918
Blue 48724
Silver 40723
Red 36743
Orange 3294
Green 3062
Brown 1847
Yellow 1564
Beige 1331
Bronze 1256
Multicolour 1134
Purple 1044
Gold 604
Pink 284
Turquoise 239
Maroon 127
Burgundy 53
Magenta 14
Navy 4
Indigo 1
Name: count, dtype: int64
Value counts for column standard_make:
standard_make
Vauxhall 31735
BMW 31478
Volkswagen 29491
Audi 28838
Mercedes-Benz 26214
...
Replica 1
Plymouth 1
Ariel 1
Pilgrim 1
Custom Vehicle 1
Name: count, Length: 75, dtype: int64
Value counts for column standard_model:
standard_model
Golf 10288
Corsa 10016
C Class 7512
3 Series 7251
Qashqai 7200
...
Series 3 1
Chevy 1
TXII 1
Crossline 1
300 1
Name: count, Length: 759, dtype: int64
Value counts for column body_type:
body_type
Hatchback 150632
SUV 87982
Saloon 30017
Estate 21027
Coupe 16199
MPV 14616
Convertible 10788
Pickup 419
Combi Van 151
Minibus 117
Limousine 62
Panel Van 40
Window Van 28
Camper 22
Car Derived Van 1
Chassis Cab 1
Name: count, dtype: int64
Value counts for column crossover_car_and_van:
crossover_car_and_van
False 330974
True 1128
Name: count, dtype: int64
Value counts for column fuel_type:
fuel_type
Petrol 176535
Diesel 138160
Petrol Hybrid 10922
Petrol Plug-in Hybrid 3420
Electric 2456
Diesel Hybrid 463
Diesel Plug-in Hybrid 89
Bi Fuel 57
Name: count, dtype: int64
Value counts for column condition:
condition
FAIRLY NEW 144242
OLD 95931
NEW 57335
VERY OLD 34594
Name: count, dtype: int64
Value counts for column usage:
usage
LOW 157594
AVERAGE 96913
HIGH 77595
Name: count, dtype: int64
Value counts for column manufacturer_popularity:
manufacturer_popularity
Very High 147756
Medium 132522
Low 33543
High 18281
Name: count, dtype: int64
We are going to use two different categorical encoders, chosen according to the characteristics and the number of distinct categories of each object column.
The target encoder will be used for features with a large number of distinct categories, where one-hot encoding would create too many columns (e.g. ['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage'])
The one-hot encoder is used on features that are binary in nature (e.g ['crossover_car_and_van'])
# The single binary categorical feature is handled with one-hot encoding.
ohe_features = ['crossover_car_and_van']
categorical_features_1 = ohe_features

# Impute the mode, then one-hot encode; drop='if_binary' collapses the
# two-level column to a single 0/1 indicator.
_ohe_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(drop='if_binary', handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer_1 = Pipeline(steps=_ohe_steps).set_output(transform='pandas')

print(categorical_features_1)
categorical_transformer_1
['crossover_car_and_van']
Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary', handle_unknown='ignore',
sparse_output=False))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary', handle_unknown='ignore',
sparse_output=False))])SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
# Re-check training dtypes before applying the encoders.
X_train.dtypes
mileage float64 standard_colour object standard_make object standard_model object year_of_registration int64 body_type object crossover_car_and_van object fuel_type object condition object usage object manufacturer_popularity object dtype: object
# Fit the one-hot pipeline on the training rows and preview its output.
categorical_transformer_1.fit_transform(X_train[categorical_features_1]).head()
| crossover_car_and_van_True | |
|---|---|
| 64508 | 0.0 |
| 41887 | 0.0 |
| 219334 | 0.0 |
| 130303 | 0.0 |
| 358615 | 0.0 |
# Keep the encoded output and confirm the indicator column is numeric.
ohe_end = categorical_transformer_1.fit_transform(X_train[categorical_features_1])
ohe_end.dtypes
crossover_car_and_van_True float64 dtype: object
# Categorical columns encoded with the mean-target strategy (each category is
# replaced by a smoothed mean of the target, price).
target_features = [
    'standard_colour', 'standard_make', 'standard_model', 'body_type',
    'fuel_type', 'manufacturer_popularity', 'condition', 'usage',
]
categorical_features_2 = target_features

# Mode-impute missing categories, then target-encode.
_target_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("targetencoder", TargetEncoder()),
]
categorical_transformer_2 = Pipeline(steps=_target_steps).set_output(transform='pandas')

print(categorical_features_2)
categorical_transformer_2
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('targetencoder', TargetEncoder())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
('targetencoder', TargetEncoder())])SimpleImputer(strategy='most_frequent')
TargetEncoder()
# Preview the target-encoded columns; y_train is required because the
# encoder learns per-category target means.
categorical_transformer_2.fit_transform(X_train[categorical_features_2], y_train).head()
| standard_colour | standard_make | standard_model | body_type | fuel_type | manufacturer_popularity | condition | usage | |
|---|---|---|---|---|---|---|---|---|
| 64508 | 11221.693920 | 10863.540611 | 9674.671451 | 16599.703672 | 11514.661186 | 10863.540611 | 9319.719073 | 11842.677944 |
| 41887 | 13629.701917 | 16729.304030 | 14949.680548 | 16267.509347 | 21724.289227 | 14226.184950 | 14381.218060 | 11842.677944 |
| 219334 | 13525.497686 | 10863.540611 | 14835.157468 | 9833.849879 | 19070.145845 | 10863.540611 | 9319.719073 | 16303.797971 |
| 130303 | 12293.497992 | 8721.801624 | 9659.911765 | 9833.849879 | 11514.661186 | 13865.595693 | 14381.218060 | 16303.797971 |
| 358615 | 14480.141802 | 16729.304030 | 12908.835002 | 9833.849879 | 14119.975667 | 14226.184950 | 9319.719073 | 7602.887907 |
# Keep the target-encoded output and confirm every column is now numeric.
trg_end = categorical_transformer_2.fit_transform(X_train[categorical_features_2], y_train)
trg_end.dtypes
standard_colour float64 standard_make float64 standard_model float64 body_type float64 fuel_type float64 manufacturer_popularity float64 condition float64 usage float64 dtype: object
# Numeric columns = everything that is neither bool nor object.
numeric_features = list(X_train.select_dtypes(exclude=['bool', 'object']).columns)

# Mean-impute numeric gaps; scaling is deferred to the post-polynomial step.
numeric_transformer = Pipeline(
    [("imputer", SimpleImputer(strategy="mean"))]
).set_output(transform='pandas')

print(numeric_features)
numeric_transformer
['mileage', 'year_of_registration']
Pipeline(steps=[('imputer', SimpleImputer())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('imputer', SimpleImputer())])SimpleImputer()
# Preview the imputed numeric columns on the training rows.
numeric_transformer.fit_transform(X_train[numeric_features]).head()
| mileage | year_of_registration | |
|---|---|---|
| 64508 | 46600.0 | 2012.0 |
| 41887 | 36617.0 | 2016.0 |
| 219334 | 14685.0 | 2014.0 |
| 130303 | 21207.0 | 2017.0 |
| 358615 | 82078.0 | 2014.0 |
# Assemble the three per-type pipelines into one ColumnTransformer.
# The three feature lists together cover all 11 feature columns, so
# remainder='passthrough' only matters if extra columns ever appear.
combined_features = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat_1", categorical_transformer_1, categorical_features_1),
        ("cat_2", categorical_transformer_2, categorical_features_2),
    ],
    remainder='passthrough',
    # Keep original column names (no "num__"/"cat_1__" prefixes).
    verbose_feature_names_out=False
).set_output(transform="pandas")
Feature engineering (e.g., deriving features based on domain knowledge; producing polynomial/basis functions and interaction features).
The following data transformations have been performed on the dataset columns:
# PolynomialFeatures is not imported in the notebook preamble, so this cell
# would raise a NameError; import it here.
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial/interaction expansion of all preprocessed features,
# followed by standardisation (zero mean, unit variance) of every generated column.
polynomial_features = Pipeline(
    steps=[
        ("poly_int", PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", StandardScaler())
    ]
).set_output(transform='pandas')
polynomial_features
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
# Full feature-engineering pipeline: type-specific preprocessing first, then
# the degree-2 polynomial/interaction expansion with scaling.
transformer_features = Pipeline(
    steps=[
        ("preprocessor",combined_features),
        ("poly_int", polynomial_features)
    ]
).set_output(transform='pandas')
transformer_features
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
In this section, I applied polynomial/basis functions and interaction features to enhance the predictive power of the dataset. The following steps were performed:
Polynomial Features: I used the PolynomialFeatures transformer with a degree of 2 to generate polynomial features based on the existing features. This included interactions between features, excluding the bias term. To ensure the data's consistency, I applied standard scaling using the StandardScaler.
Scaling the Data: To bring all features to a comparable scale, I employed the StandardScaler. This transformation standardized the numerical features, ensuring that they have zero mean and unit variance.
To implement these transformations, I used pipelines. The "polynomial_features" pipeline consisted of the PolynomialFeatures and StandardScaler transformers. The "transformer_features" pipeline incorporated the "preprocessor" pipeline and the "polynomial_features" pipeline.
For preprocessing the numeric and categorical features, I employed the "preprocessor" pipeline, which wraps the "transformer_features" pipeline described above. Within it, SimpleImputer handles missing values, OneHotEncoder encodes the binary categorical feature, and TargetEncoder encodes the remaining categorical features using the target variable; the "remainder" parameter allows any remaining columns to pass through without transformation.
By fitting the preprocessor pipeline to the training data (X_train) and the target variable (y_train), I transformed the data. The resulting transformed dataset displayed 77 columns, encompassing the original features, polynomial features, interaction features, and scaled values. The transformed training data had a shape of (249076, 77), indicating that each instance had 77 features.
These preprocessing steps were crucial in preparing the data for subsequent modeling and analysis, ensuring that the features were appropriately scaled and enhanced with polynomial and interaction terms.
# `transformer_features` is already a complete Pipeline; wrapping it in
# another single-step Pipeline only adds a layer of nesting with no effect,
# so reuse it directly. fit/transform behaviour is unchanged.
preprocessor = transformer_features
preprocessor
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
# Fit the whole preprocessing pipeline (target encoding needs y_train) and
# preview the transformed training rows.
preprocessor.fit_transform(X_train, y_train).head()
| mileage | year_of_registration | crossover_car_and_van_True | standard_colour | standard_make | standard_model | body_type | fuel_type | manufacturer_popularity | condition | ... | fuel_type^2 | fuel_type manufacturer_popularity | fuel_type condition | fuel_type usage | manufacturer_popularity^2 | manufacturer_popularity condition | manufacturer_popularity usage | condition^2 | condition usage | usage^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64508 | 0.256634 | -1.045970 | -0.058518 | -1.418811 | -0.543949 | -0.614341 | 1.168141 | -0.772114 | -1.585367 | -0.789974 | ... | -0.698201 | -1.411812 | -0.923334 | -0.614820 | -1.524947 | -1.074474 | -0.814819 | -0.836446 | -0.722348 | -0.465741 |
| 41887 | -0.078984 | 0.160170 | -0.058518 | 0.542288 | 0.976669 | 0.369986 | 1.061425 | 4.667905 | 0.952895 | 0.307270 | ... | 5.378225 | 4.616471 | 2.181315 | 1.791074 | 0.958826 | 0.589966 | 0.015218 | 0.142671 | -0.093102 | -0.465741 |
| 219334 | -0.816316 | -0.442900 | -0.058518 | 0.457424 | -0.543949 | 0.348615 | -1.005366 | 3.253692 | -1.585367 | -0.789974 | ... | 3.439500 | 1.277834 | 0.142512 | 2.858461 | -1.524947 | -1.074474 | 0.195322 | -0.836446 | -0.285895 | 0.989014 |
| 130303 | -0.597053 | 0.461705 | -0.058518 | -0.545926 | -1.099165 | -0.617095 | -1.005366 | -0.772114 | 0.680708 | 0.307270 | ... | -0.698201 | -0.279069 | -0.041149 | 0.407323 | 0.660613 | 0.506446 | 1.215496 | 0.142671 | 0.580388 | 0.989014 |
| 358615 | 1.449369 | -0.442900 | -0.058518 | 1.234893 | 0.976669 | -0.010840 | -1.005366 | 0.616082 | 0.952895 | -0.789974 | ... | 0.497653 | 1.071524 | -0.555804 | -1.192107 | 0.958826 | -0.569739 | -1.241968 | -0.836446 | -1.137148 | -1.420925 |
5 rows × 77 columns
# Materialise the preprocessed training matrix (77 engineered columns).
X_train_pp = preprocessor.fit_transform(X_train, y_train)
X_train_pp.shape
(249076, 77)
Feature selection (e.g., performing manual selection guided by domain knowledge and exploratory data analysis, or running automated selection algorithms to find the most useful predictors).
# Phi_K: a correlation coefficient that also captures non-linear association.
import phik
from phik import resources, report
# Re-attach the target so correlations with price can be computed.
train_data = pd.concat([X_train_pp, y_train], axis=1)
print(train_data.isnull().sum().sum())  # sanity check: expect 0 missing values
train_data.head()
0
| mileage | year_of_registration | crossover_car_and_van_True | standard_colour | standard_make | standard_model | body_type | fuel_type | manufacturer_popularity | condition | ... | fuel_type manufacturer_popularity | fuel_type condition | fuel_type usage | manufacturer_popularity^2 | manufacturer_popularity condition | manufacturer_popularity usage | condition^2 | condition usage | usage^2 | price | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64508 | 0.256634 | -1.045970 | -0.058518 | -1.418811 | -0.543949 | -0.614341 | 1.168141 | -0.772114 | -1.585367 | -0.789974 | ... | -1.411812 | -0.923334 | -0.614820 | -1.524947 | -1.074474 | -0.814819 | -0.836446 | -0.722348 | -0.465741 | 6000 |
| 41887 | -0.078984 | 0.160170 | -0.058518 | 0.542288 | 0.976669 | 0.369986 | 1.061425 | 4.667905 | 0.952895 | 0.307270 | ... | 4.616471 | 2.181315 | 1.791074 | 0.958826 | 0.589966 | 0.015218 | 0.142671 | -0.093102 | -0.465741 | 14450 |
| 219334 | -0.816316 | -0.442900 | -0.058518 | 0.457424 | -0.543949 | 0.348615 | -1.005366 | 3.253692 | -1.585367 | -0.789974 | ... | 1.277834 | 0.142512 | 2.858461 | -1.524947 | -1.074474 | 0.195322 | -0.836446 | -0.285895 | 0.989014 | 8562 |
| 130303 | -0.597053 | 0.461705 | -0.058518 | -0.545926 | -1.099165 | -0.617095 | -1.005366 | -0.772114 | 0.680708 | 0.307270 | ... | -0.279069 | -0.041149 | 0.407323 | 0.660613 | 0.506446 | 1.215496 | 0.142671 | 0.580388 | 0.989014 | 8390 |
| 358615 | 1.449369 | -0.442900 | -0.058518 | 1.234893 | 0.976669 | -0.010840 | -1.005366 | 0.616082 | 0.952895 | -0.789974 | ... | 1.071524 | -0.555804 | -1.192107 | 0.958826 | -0.569739 | -1.241968 | -0.836446 | -1.137148 | -1.420925 | 9500 |
5 rows × 78 columns
# Phi_K correlation of every column against every other column.
phi_k_matrix = train_data.phik_matrix()

# Keep only the correlations with the target as a tidy two-column frame.
price_phi_k = phi_k_matrix['price']
df_phi_k = pd.DataFrame(
    {'Feature': list(price_phi_k.index), 'Phi_K': list(price_phi_k.values)}
)

# Strongest associations with price first.
rank_features = df_phi_k.sort_values('Phi_K', ascending=False)
interval columns not set, guessing: ['mileage', 'year_of_registration', 'crossover_car_and_van_True', 'standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage', 'mileage^2', 'mileage year_of_registration', 'mileage crossover_car_and_van_True', 'mileage standard_colour', 'mileage standard_make', 'mileage standard_model', 'mileage body_type', 'mileage fuel_type', 'mileage manufacturer_popularity', 'mileage condition', 'mileage usage', 'year_of_registration^2', 'year_of_registration crossover_car_and_van_True', 'year_of_registration standard_colour', 'year_of_registration standard_make', 'year_of_registration standard_model', 'year_of_registration body_type', 'year_of_registration fuel_type', 'year_of_registration manufacturer_popularity', 'year_of_registration condition', 'year_of_registration usage', 'crossover_car_and_van_True^2', 'crossover_car_and_van_True standard_colour', 'crossover_car_and_van_True standard_make', 'crossover_car_and_van_True standard_model', 'crossover_car_and_van_True body_type', 'crossover_car_and_van_True fuel_type', 'crossover_car_and_van_True manufacturer_popularity', 'crossover_car_and_van_True condition', 'crossover_car_and_van_True usage', 'standard_colour^2', 'standard_colour standard_make', 'standard_colour standard_model', 'standard_colour body_type', 'standard_colour fuel_type', 'standard_colour manufacturer_popularity', 'standard_colour condition', 'standard_colour usage', 'standard_make^2', 'standard_make standard_model', 'standard_make body_type', 'standard_make fuel_type', 'standard_make manufacturer_popularity', 'standard_make condition', 'standard_make usage', 'standard_model^2', 'standard_model body_type', 'standard_model fuel_type', 'standard_model manufacturer_popularity', 'standard_model condition', 'standard_model usage', 'body_type^2', 'body_type fuel_type', 'body_type manufacturer_popularity', 'body_type condition', 'body_type usage', 'fuel_type^2', 
'fuel_type manufacturer_popularity', 'fuel_type condition', 'fuel_type usage', 'manufacturer_popularity^2', 'manufacturer_popularity condition', 'manufacturer_popularity usage', 'condition^2', 'condition usage', 'usage^2', 'price']
Weak Correlation (0 to 0.3): If the Phi_K value between a feature and the target variable (price) falls within this range, it indicates a weak correlation. The feature may have minimal influence on predicting the price and might not be a strong predictor on its own.
Moderate Correlation (0.3 to 0.59): If the Phi_K value between a feature and the target variable falls within this range, it suggests a moderate correlation. The feature can provide some information in predicting the price, but its impact may not be as strong as features with higher Phi_K values.
Strong Correlation (0.6 to 1): If the Phi_K value between a feature and the target variable falls within this range, it indicates a strong correlation. The feature has a substantial influence on predicting the price and can be considered a highly valuable predictor.
# Features whose Phi_K with price exceeds this value count as "strong".
threshold = 0.60

# Keep only the strongly correlated features and report how many there are.
top_features = rank_features.loc[rank_features['Phi_K'] > threshold]
count_top_features = top_features.shape[0]
print("Number of top features:", count_top_features)
print(top_features)
Number of top features: 23
Feature Phi_K
77 price 1.000000
60 standard_model condition 0.839754
61 standard_model usage 0.823988
65 body_type condition 0.754401
26 year_of_registration standard_model 0.739892
55 standard_make usage 0.738724
5 standard_model 0.738207
43 standard_colour standard_model 0.725146
69 fuel_type condition 0.720628
59 standard_model manufacturer_popularity 0.718567
1 year_of_registration 0.718324
22 year_of_registration^2 0.718208
47 standard_colour condition 0.702730
56 standard_model^2 0.699592
57 standard_model body_type 0.695036
9 condition 0.686940
74 condition^2 0.686940
30 year_of_registration condition 0.686940
58 standard_model fuel_type 0.679834
50 standard_make standard_model 0.674426
54 standard_make condition 0.672927
66 body_type usage 0.670602
72 manufacturer_popularity condition 0.609153
# List the top 30 rows of the DataFrame ranked by Phi_K with price.
print(rank_features.dtypes)
rank_features.head(30)
Feature object Phi_K float64 dtype: object
| Feature | Phi_K | |
|---|---|---|
| 77 | price | 1.000000 |
| 60 | standard_model condition | 0.839754 |
| 61 | standard_model usage | 0.823988 |
| 65 | body_type condition | 0.754401 |
| 26 | year_of_registration standard_model | 0.739892 |
| 55 | standard_make usage | 0.738724 |
| 5 | standard_model | 0.738207 |
| 43 | standard_colour standard_model | 0.725146 |
| 69 | fuel_type condition | 0.720628 |
| 59 | standard_model manufacturer_popularity | 0.718567 |
| 1 | year_of_registration | 0.718324 |
| 22 | year_of_registration^2 | 0.718208 |
| 47 | standard_colour condition | 0.702730 |
| 56 | standard_model^2 | 0.699592 |
| 57 | standard_model body_type | 0.695036 |
| 9 | condition | 0.686940 |
| 74 | condition^2 | 0.686940 |
| 30 | year_of_registration condition | 0.686940 |
| 58 | standard_model fuel_type | 0.679834 |
| 50 | standard_make standard_model | 0.674426 |
| 54 | standard_make condition | 0.672927 |
| 66 | body_type usage | 0.670602 |
| 72 | manufacturer_popularity condition | 0.609153 |
| 75 | condition usage | 0.585598 |
| 51 | standard_make body_type | 0.582077 |
| 0 | mileage | 0.575386 |
| 12 | mileage year_of_registration | 0.574453 |
| 21 | mileage usage | 0.560258 |
| 14 | mileage standard_colour | 0.547563 |
| 76 | usage^2 | 0.547109 |
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
# Bar chart of the 26 strongest Phi_K associations with price.
sns.barplot(x="Phi_K", y="Feature", data=rank_features.head(26))
# Save the plot as an image file (PNG).
# NOTE(review): filename says "top_20" but 26 features are plotted — confirm intent.
plt.savefig('phi_k_top_20_features.png')
# Compute the statistical significance matrix for the entire DataFrame
sgf_matrix = train_data.significance_matrix()
# Subset the significance matrix for the 'price' column
sgf_matrix = sgf_matrix['price']
# Create a DataFrame from the significance results
df_sgf = pd.DataFrame({'Feature': sgf_matrix.index, 'sgf': sgf_matrix.values})
# Sort the DataFrame by significance score (sgf) in descending order
sgf_rank_features = df_sgf.sort_values(by='sgf', ascending=False)
interval columns not set, guessing: ['mileage', 'year_of_registration', 'crossover_car_and_van_True', 'standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage', 'mileage^2', 'mileage year_of_registration', 'mileage crossover_car_and_van_True', 'mileage standard_colour', 'mileage standard_make', 'mileage standard_model', 'mileage body_type', 'mileage fuel_type', 'mileage manufacturer_popularity', 'mileage condition', 'mileage usage', 'year_of_registration^2', 'year_of_registration crossover_car_and_van_True', 'year_of_registration standard_colour', 'year_of_registration standard_make', 'year_of_registration standard_model', 'year_of_registration body_type', 'year_of_registration fuel_type', 'year_of_registration manufacturer_popularity', 'year_of_registration condition', 'year_of_registration usage', 'crossover_car_and_van_True^2', 'crossover_car_and_van_True standard_colour', 'crossover_car_and_van_True standard_make', 'crossover_car_and_van_True standard_model', 'crossover_car_and_van_True body_type', 'crossover_car_and_van_True fuel_type', 'crossover_car_and_van_True manufacturer_popularity', 'crossover_car_and_van_True condition', 'crossover_car_and_van_True usage', 'standard_colour^2', 'standard_colour standard_make', 'standard_colour standard_model', 'standard_colour body_type', 'standard_colour fuel_type', 'standard_colour manufacturer_popularity', 'standard_colour condition', 'standard_colour usage', 'standard_make^2', 'standard_make standard_model', 'standard_make body_type', 'standard_make fuel_type', 'standard_make manufacturer_popularity', 'standard_make condition', 'standard_make usage', 'standard_model^2', 'standard_model body_type', 'standard_model fuel_type', 'standard_model manufacturer_popularity', 'standard_model condition', 'standard_model usage', 'body_type^2', 'body_type fuel_type', 'body_type manufacturer_popularity', 'body_type condition', 'body_type usage', 'fuel_type^2', 
'fuel_type manufacturer_popularity', 'fuel_type condition', 'fuel_type usage', 'manufacturer_popularity^2', 'manufacturer_popularity condition', 'manufacturer_popularity usage', 'condition^2', 'condition usage', 'usage^2', 'price']
# Show the 26 features most significantly associated with price.
sgf_rank_features.head(26)
| Feature | sgf | |
|---|---|---|
| 77 | price | 1016.381868 |
| 60 | standard_model condition | 579.039930 |
| 61 | standard_model usage | 549.069147 |
| 54 | standard_make condition | 514.893087 |
| 65 | body_type condition | 456.585673 |
| 55 | standard_make usage | 442.139642 |
| 26 | year_of_registration standard_model | 428.840189 |
| 5 | standard_model | 426.857074 |
| 43 | standard_colour standard_model | 417.215330 |
| 72 | manufacturer_popularity condition | 413.810062 |
| 22 | year_of_registration^2 | 411.574159 |
| 1 | year_of_registration | 411.324879 |
| 59 | standard_model manufacturer_popularity | 410.719980 |
| 69 | fuel_type condition | 405.566240 |
| 56 | standard_model^2 | 398.986078 |
| 47 | standard_colour condition | 396.133207 |
| 57 | standard_model body_type | 394.834370 |
| 75 | condition usage | 393.651928 |
| 9 | condition | 392.239537 |
| 74 | condition^2 | 392.237887 |
| 30 | year_of_registration condition | 392.236225 |
| 58 | standard_model fuel_type | 387.571005 |
| 66 | body_type usage | 382.241678 |
| 50 | standard_make standard_model | 381.810713 |
| 70 | fuel_type usage | 336.020237 |
| 51 | standard_make body_type | 314.566110 |
# Save the current style settings
original_style = sns.axes_style()
# Set the desired style for the plot
sns.set_style('whitegrid')
# Pearson correlation of every (numeric) column with price, sorted descending.
price_corr_with_feat = train_data.corr()['price'].sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(20, 40))
# NOTE(review): price_corr duplicates price_corr_with_feat (unsorted) and is
# not used below — candidate for removal if unused later in the notebook.
price_corr = train_data.corrwith(train_data['price'])
# Single-column heatmap of the correlations, annotated with values.
sns.heatmap(price_corr_with_feat.to_frame(), cmap='YlGnBu', annot=True, ax=ax)
ax.set_title('Correlation of Price with features')
plt.tight_layout()
# Show the plot
plt.show()
# Save the current style settings
original_style = sns.axes_style()
# Set the desired style for the plot
sns.set_style('whitegrid')
# Bar chart of the 30 features with the highest Pearson correlation with price.
top_30_features = price_corr_with_feat.head(30)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=top_30_features.values, y=top_30_features.index, ax=ax)
ax.set_xlabel('Correlation with Price')
ax.set_ylabel('Features')
ax.set_title('Top 30 Features Correlation with Price')
plt.tight_layout()
# Revert back to the original style settings
sns.set_style(original_style)
# Show the plot
plt.show()
from sklearn.feature_selection import SelectKBest, f_regression
# Wrap the selector in a Pipeline so its output can be a pandas DataFrame.
# The single step keeps the auto-generated name 'selectkbest' that
# make_pipeline would have assigned.
selector = Pipeline(
    steps=[("selectkbest", SelectKBest(f_regression, k=22))]
).set_output(transform='pandas')
selector.fit(X_train_pp, y_train)
# Reduce the preprocessed training features to the 22 best-scoring columns
X_sel = selector.transform(X_train_pp)
selector.get_feature_names_out()
array(['year_of_registration', 'standard_model', 'condition',
'year_of_registration^2', 'year_of_registration standard_model',
'year_of_registration condition', 'standard_colour standard_model',
'standard_colour condition', 'standard_make standard_model',
'standard_make condition', 'standard_make usage',
'standard_model^2', 'standard_model body_type',
'standard_model fuel_type',
'standard_model manufacturer_popularity',
'standard_model condition', 'standard_model usage',
'body_type condition', 'body_type usage', 'fuel_type condition',
'manufacturer_popularity condition', 'condition^2'], dtype=object)
X_sel.head()
| year_of_registration | standard_model | condition | year_of_registration^2 | year_of_registration standard_model | year_of_registration condition | standard_colour standard_model | standard_colour condition | standard_make standard_model | standard_make condition | ... | standard_model body_type | standard_model fuel_type | standard_model manufacturer_popularity | standard_model condition | standard_model usage | body_type condition | body_type usage | fuel_type condition | manufacturer_popularity condition | condition^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64508 | -1.045970 | -0.614341 | -0.789974 | -1.046677 | -0.617331 | -0.791920 | -0.821167 | -1.010474 | -0.652337 | -0.860563 | ... | -0.166906 | -0.727193 | -0.853031 | -0.807101 | -0.627630 | -0.191427 | 0.463986 | -0.923334 | -1.074474 | -0.836446 |
| 41887 | 0.160170 | 0.369986 | 0.307270 | 0.159488 | 0.370124 | 0.305397 | 0.471668 | 0.431482 | 0.565714 | 0.941073 | ... | 0.644887 | 1.855644 | 0.564783 | 0.409848 | 0.091803 | 0.843717 | 0.400347 | 2.181315 | 0.589966 | 0.142671 |
| 219334 | -0.442900 | 0.348615 | -0.789974 | -0.443894 | 0.346018 | -0.789923 | 0.429474 | -0.671850 | -0.181392 | -0.860563 | ... | -0.311480 | 1.348900 | -0.114163 | -0.338235 | 0.838356 | -1.015130 | -0.122504 | 0.142512 | -1.074474 | -0.836446 |
| 130303 | 0.461705 | -0.617095 | 0.307270 | 0.461404 | -0.615609 | 0.306938 | -0.682804 | 0.128415 | -0.827483 | -0.547863 | ... | -0.811669 | -0.729250 | -0.472939 | -0.331783 | -0.133354 | -0.364928 | -0.122504 | -0.041149 | 0.506446 | 0.142671 |
| 358615 | -0.442900 | -0.010840 | -0.789974 | -0.443894 | -0.012915 | -0.789923 | 0.242990 | -0.531532 | 0.278903 | -0.153741 | ... | -0.497660 | 0.130706 | 0.182131 | -0.513255 | -0.816840 | -1.015130 | -1.506614 | -0.555804 | -0.569739 | -0.836446 |
5 rows × 22 columns
Determining a Reasonable Threshold for Number of Principal Components
X_sel.head()
| year_of_registration | standard_model | condition | year_of_registration^2 | year_of_registration standard_model | year_of_registration condition | standard_colour standard_model | standard_colour condition | standard_make standard_model | standard_make condition | ... | standard_model body_type | standard_model fuel_type | standard_model manufacturer_popularity | standard_model condition | standard_model usage | body_type condition | body_type usage | fuel_type condition | manufacturer_popularity condition | condition^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 64508 | -1.045970 | -0.614341 | -0.789974 | -1.046677 | -0.617331 | -0.791920 | -0.821167 | -1.010474 | -0.652337 | -0.860563 | ... | -0.166906 | -0.727193 | -0.853031 | -0.807101 | -0.627630 | -0.191427 | 0.463986 | -0.923334 | -1.074474 | -0.836446 |
| 41887 | 0.160170 | 0.369986 | 0.307270 | 0.159488 | 0.370124 | 0.305397 | 0.471668 | 0.431482 | 0.565714 | 0.941073 | ... | 0.644887 | 1.855644 | 0.564783 | 0.409848 | 0.091803 | 0.843717 | 0.400347 | 2.181315 | 0.589966 | 0.142671 |
| 219334 | -0.442900 | 0.348615 | -0.789974 | -0.443894 | 0.346018 | -0.789923 | 0.429474 | -0.671850 | -0.181392 | -0.860563 | ... | -0.311480 | 1.348900 | -0.114163 | -0.338235 | 0.838356 | -1.015130 | -0.122504 | 0.142512 | -1.074474 | -0.836446 |
| 130303 | 0.461705 | -0.617095 | 0.307270 | 0.461404 | -0.615609 | 0.306938 | -0.682804 | 0.128415 | -0.827483 | -0.547863 | ... | -0.811669 | -0.729250 | -0.472939 | -0.331783 | -0.133354 | -0.364928 | -0.122504 | -0.041149 | 0.506446 | 0.142671 |
| 358615 | -0.442900 | -0.010840 | -0.789974 | -0.443894 | -0.012915 | -0.789923 | 0.242990 | -0.531532 | 0.278903 | -0.153741 | ... | -0.497660 | 0.130706 | 0.182131 | -0.513255 | -0.816840 | -1.015130 | -1.506614 | -0.555804 | -0.569739 | -0.836446 |
5 rows × 22 columns
# Fit a PCA that keeps every component, purely to inspect the explained-
# variance spectrum before choosing a component count.
# NOTE(review): PCA is not imported in the visible header — presumably
# `from sklearn.decomposition import PCA` appears in an earlier cell; confirm.
pca_full = PCA()
pca_full.fit(X_sel)
PCA()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA()
pca_full.explained_variance_ratio_
array([6.01858376e-01, 2.77943585e-01, 3.68672080e-02, 3.22543008e-02,
1.39796031e-02, 1.06140749e-02, 9.32795672e-03, 4.89843562e-03,
4.67722146e-03, 2.85780390e-03, 1.79078924e-03, 6.22243901e-04,
5.92991866e-04, 4.20822093e-04, 3.67009366e-04, 3.46739710e-04,
2.62375314e-04, 1.99448509e-04, 1.18992795e-04, 1.31721311e-08,
8.38749132e-09, 2.05577422e-10])
sns.set(style="whitegrid")
# Plot cumulative explained variance vs. number of retained components,
# the standard "elbow" view for choosing a PCA threshold
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');
# Persist the figure to disk
plt.savefig("plot.png")
# keep the first 13 principal components of the data
# (the previous comment said 11, but n_components=13 is what the code uses)
pca = PCA(n_components=13)
# fit PCA model to the train data
pca.fit(X_sel)
# project the data onto the first 13 principal components
X_pca = pca.transform(X_sel)
print("Original shape: {}".format(str(X_sel.shape)))
print("Reduced shape: {}".format(str(X_pca.shape)))
Original shape: (249076, 22) Reduced shape: (249076, 13)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Restore the notebook-wide seaborn defaults set at the top of the file
sns.set(
style='ticks',
context='talk',
font_scale=0.8,
rc={'figure.figsize': (8,6)}
)
# Heatmap of |loading| for every (component, feature) pair: bright cells mark
# the features that dominate a given principal component
n_components = len(pca.components_)
plt.matshow(np.abs(pca.components_), cmap='coolwarm')
plt.yticks(range(n_components), [f"Component {i}" for i in range(1, n_components+1)])
plt.colorbar()
plt.xticks(range(len(X_sel.columns)),
X_sel.columns, rotation=60, ha='left')
plt.xlabel("Feature")
plt.ylabel("Principal components");
In general, when plotting the principal components, the purpose is to visualize how much of the variance in the data is captured by each principal component, and how the samples in the dataset cluster or separate from each other in the principal component space.
If the samples are highly concentrated together in the principal component space, this could indicate that there is not much variability in the data, or that the features do not provide much discriminatory power. On the other hand, if the samples are well-separated in the principal component space, this suggests that the features are providing useful information for distinguishing between different samples.
The color of the points can represent the target variable (in this case, price), so areas with high concentrations of yellow points could indicate regions of the principal component space where the prices tend to be higher. Conversely, areas with more red points and fewer yellow points could indicate regions where prices tend to be lower.
The Influence of the Features in the Components
The scatter plot that we've generated shows the relationship between each feature and the first principal component. Each point in the plot represents a sample in our data set, with the x-coordinate representing the value of the feature for that sample, and the y-coordinate representing the score of the sample on the first principal component.
The slope and direction of each scatter plot indicate the correlation between the feature and the first principal component. A positive slope indicates a positive correlation between the feature and the first principal component, meaning that as the value of the feature increases, the score on the first principal component tends to increase as well. Similarly, a negative slope indicates a negative correlation between the feature and the first principal component, meaning that as the value of the feature increases, the score on the first principal component tends to decrease.
By examining the scatter plot, we can identify the features that are most strongly correlated with the first principal component, and thus have the largest impact on the variation captured by the first principal component. These features may be good candidates for further analysis or interpretation, depending on the goals of the analysis.
It's worth noting that in some cases, the relationship between a feature and a principal component may not be immediately apparent from the scatter plot. This may be due to non-linear relationships between the features and the principal components, or to interactions between different features. In such cases, additional analysis or visualization techniques may be necessary to fully understand the relationships between the features and the principal components.
def plot_principal_component(pca, X_sel, pc_index):
    """Bar plot of each feature's correlation with one principal component.

    Parameters
    ----------
    pca : fitted sklearn ``PCA`` instance.
    X_sel : pandas DataFrame of the features the PCA was fitted on.
    pc_index : zero-based index of the principal component to inspect.
    """
    # Append the component scores as an extra column so that a single
    # corr() call yields every feature's correlation with the component.
    # (The previous version also extracted pca.components_[pc_index] into
    # an unused variable — removed.)
    df = X_sel.copy()
    df[f"PC{pc_index + 1}"] = pca.transform(X_sel)[:, pc_index]
    corr = df.corr().loc[:, f"PC{pc_index + 1}"]
    # Horizontal bar plot of the correlation values
    sns.barplot(x=corr.values, y=corr.index)
    plt.xlabel(f"Correlation with Principal Component {pc_index + 1}")
    plt.ylabel("Feature")
    plt.title(f"Correlation between Features and Principal Component {pc_index + 1}")
    plt.show()
def plot_correlation_heatmap(pca, X_sel, pc_index):
    """Heatmap of each feature's correlation with one principal component.

    Parameters
    ----------
    pca : fitted sklearn ``PCA`` instance.
    X_sel : pandas DataFrame of the features the PCA was fitted on.
    pc_index : zero-based index of the principal component to inspect.
    """
    # Append the component scores as an extra column so that a single
    # corr() call yields every feature's correlation with the component.
    # (Removed the unused pca.components_[pc_index] lookup and the stray
    # trailing comma in the signature.)
    df = X_sel.copy()
    df[f"PC{pc_index + 1}"] = pca.transform(X_sel)[:, pc_index]
    corr = df.corr().loc[:, f"PC{pc_index + 1}"]
    # One-row heatmap of the correlation values, annotated with the numbers
    sns.heatmap(corr.to_frame().T, cmap="Greys_r", annot=True, fmt=".2f", cbar=False,
                annot_kws={"fontsize": 10, "fontweight": "bold", "fontfamily": "serif"},
                xticklabels=True, yticklabels=True)
    # Stretch the heatmap so the feature labels stay legible
    plt.gcf().set_size_inches(16, 4)
    plt.xlabel("")
    plt.ylabel(f"Correlation with Principal Component {pc_index + 1}", fontsize=10, fontweight="bold", fontfamily="serif")
    plt.title(f"Correlation between Features and Principal Component {pc_index + 1}", fontsize=10, fontweight="bold", fontfamily="serif")
    plt.show()
# Inspect the prominent features of principal components 1, 4, 8 and 13
# (indices 0, 3, 7 and 12): first the correlation bar plot, then the heatmap.
for component_index in (0, 3, 7, 12):
    plot_principal_component(pca, X_sel, component_index)
    plot_correlation_heatmap(pca, X_sel, component_index)
This code is used to calculate and visualize the correlation between the features of a dataset and its first principal component (PC1) using a heatmap.
First, the code gets the first principal component of the dataset using PCA. Then, it creates a copy of the preprocessed dataset and adds a new column containing the scores of the first principal component for each data point.
The code then calculates the correlation between each feature and the first principal component, and creates a heatmap to visualize these correlation values. The heatmap is stretched to make it easier to read and annotations are added to display the correlation values.
The resulting heatmap allows you to quickly identify which features have the highest and lowest correlation with the first principal component. This information can be useful in understanding how the features of the dataset are related and which features are most important for explaining the variation in the data.
pca.components_
array([[ 2.01182192e-01, 2.11032559e-01, 2.08928269e-01,
2.01222179e-01, 2.11662471e-01, 2.08941915e-01,
2.10016542e-01, 2.08900984e-01, 1.85847797e-01,
2.44748326e-01, 2.04284723e-01, 2.01391085e-01,
2.01175244e-01, 2.00707476e-01, 2.03745126e-01,
2.65938387e-01, 2.46715890e-01, 2.32147908e-01,
1.92746377e-01, 2.15277417e-01, 2.13627533e-01,
2.02557721e-01],
[-2.40992073e-01, 2.53363741e-01, -2.57098395e-01,
-2.41050535e-01, 2.52257079e-01, -2.57112946e-01,
2.42689268e-01, -2.40215921e-01, 2.73650696e-01,
-5.06450912e-02, 3.38849704e-02, 2.58554277e-01,
2.42900030e-01, 2.41872703e-01, 2.53682380e-01,
2.37135230e-02, 9.67080187e-02, -1.22332640e-01,
-5.48161386e-02, -1.93787572e-01, -2.25746979e-01,
-2.51945092e-01],
[ 9.88342981e-03, 7.22043703e-02, 9.00115523e-02,
9.88017746e-03, 7.19785206e-02, 8.97785653e-02,
8.70771978e-02, 1.05414364e-01, 1.21384203e-01,
1.59202538e-01, -2.73119044e-01, 7.88530970e-02,
-1.56229800e-01, 1.47202867e-01, 1.15095897e-01,
7.32678538e-02, -2.50030374e-01, -2.46998135e-01,
-7.62196711e-01, 1.79207640e-01, 1.47762362e-01,
1.02733455e-01],
[ 1.42288199e-02, -7.42616179e-02, -3.89259534e-02,
1.41804802e-02, -7.42020383e-02, -3.88608904e-02,
-7.58880949e-02, -4.43861000e-02, 2.12143969e-01,
3.75747289e-01, 7.23750152e-01, -7.83352160e-02,
-2.69830044e-01, -1.23156610e-01, 6.69024968e-03,
-8.31750817e-02, 1.23755933e-01, -3.45235779e-01,
-1.20151429e-01, -1.08734399e-01, 5.21960768e-02,
-5.70396595e-02],
[-5.49076181e-01, -7.36192924e-02, 5.89775122e-02,
-5.48166572e-01, -7.49418386e-02, 5.84629224e-02,
-7.05924571e-02, 6.90291865e-02, 1.08789009e-01,
3.20036235e-01, 4.15267605e-02, -7.05070419e-02,
1.08492604e-01, -1.04118921e-01, -3.69166100e-02,
9.66739172e-02, -1.82628097e-01, 3.15061759e-01,
3.07209295e-02, 2.52321460e-02, 1.07280985e-01,
2.65268697e-01],
[-8.05653799e-02, -6.85979312e-02, -7.40952752e-02,
-8.06114834e-02, -6.85463255e-02, -7.42245497e-02,
-1.27017979e-01, -1.39549300e-01, -6.14690260e-02,
1.11592700e-02, 1.27520369e-01, -1.37925414e-01,
-6.53844734e-02, 5.51762366e-01, -1.64373940e-01,
-4.57671352e-02, 2.01124408e-02, -3.82965933e-02,
8.13236721e-02, 7.08895021e-01, -1.88927501e-01,
-8.52715611e-02],
[-2.53867414e-01, 3.91192505e-02, 7.99204126e-02,
-2.52982701e-01, 3.92074582e-02, 8.02835935e-02,
5.44520578e-02, 1.09505888e-01, -2.96786727e-01,
-3.45168462e-01, 7.24746229e-03, 1.57045950e-01,
-2.95619435e-01, -3.86008721e-02, 6.93058686e-03,
2.40959529e-01, 5.34451762e-01, -3.27278859e-01,
-3.36955730e-03, 3.85201369e-02, 5.02358743e-02,
2.54941815e-01],
[ 5.37977549e-02, -1.02196817e-02, 5.02653101e-02,
5.39416436e-02, -8.86066699e-03, 5.06072960e-02,
1.34536799e-01, 2.10504300e-01, 1.09367360e-01,
1.45496426e-01, 6.78962098e-03, 2.29345403e-01,
3.51841051e-02, -8.43884538e-02, -5.73377281e-01,
2.04685970e-01, 3.93705653e-02, 3.21743560e-02,
-1.19353473e-01, -6.40866591e-02, -6.57964076e-01,
8.86899227e-02],
[ 8.01294026e-02, 5.36681495e-03, 5.02768964e-02,
8.02028568e-02, 7.63966054e-03, 5.06279143e-02,
-5.63932304e-01, -6.32342509e-01, 2.12908230e-02,
1.25250206e-01, -9.42746460e-02, 2.66307344e-01,
3.57208188e-02, -6.31809389e-02, -3.73341809e-02,
3.41479142e-01, 1.36260586e-01, 8.49068666e-02,
-1.02835628e-01, -3.29429714e-02, 2.03903082e-02,
8.37961722e-02],
[-2.43627310e-02, 4.15497318e-03, 1.97133746e-01,
-2.36201630e-02, 7.61267244e-05, 1.97723539e-01,
-1.82449438e-01, -3.80431281e-02, 4.08811032e-01,
-3.50894232e-01, 7.56781277e-02, 3.22116544e-01,
1.36644820e-01, 1.60483387e-02, -8.07512151e-02,
-5.05763223e-01, -1.44946612e-01, -2.06481972e-01,
1.09066738e-01, 5.85865262e-02, 2.11697610e-03,
3.55214194e-01],
[-1.80545114e-02, -3.67753239e-01, -1.48363886e-01,
-1.88137304e-02, -3.64222950e-01, -1.49625999e-01,
-1.13402063e-02, 2.49938987e-01, 1.67689686e-01,
-1.73379912e-02, -7.43297942e-02, 5.54377691e-01,
1.14867001e-02, -6.49615341e-02, -4.94113655e-02,
1.45365040e-01, 1.27629197e-02, -2.17234150e-02,
7.32808176e-02, 1.51397714e-01, 2.97797625e-01,
-3.70954745e-01],
[-1.24793002e-01, 2.22699771e-01, 3.75476481e-01,
-1.27629011e-01, 2.18899129e-01, 3.66178910e-01,
-1.02045236e-01, 1.68463117e-02, 1.32541706e-01,
-2.82483574e-02, -6.09403218e-02, -3.17655631e-02,
-1.74348797e-01, -2.07424655e-01, -9.01028895e-02,
-1.26983026e-01, 1.25352230e-01, 1.14030270e-01,
-1.52732215e-02, 1.33748186e-01, -9.56306763e-03,
-6.45891456e-01],
[ 4.38878244e-02, -2.03732794e-03, -1.34876868e-01,
4.45522308e-02, -1.17546454e-03, -1.32266346e-01,
9.54430661e-02, -7.25960839e-02, 4.91044813e-01,
-1.50000561e-01, -1.35101152e-01, -6.97582998e-02,
-6.83652941e-01, -5.24413251e-02, 1.51325808e-01,
1.10786704e-01, -2.35032498e-02, 3.08327050e-01,
1.38731467e-01, 4.86980106e-02, -1.17191013e-01,
1.51656612e-01]])
pca.components_[0]
array([0.20118219, 0.21103256, 0.20892827, 0.20122218, 0.21166247,
0.20894192, 0.21001654, 0.20890098, 0.1858478 , 0.24474833,
0.20428472, 0.20139109, 0.20117524, 0.20070748, 0.20374513,
0.26593839, 0.24671589, 0.23214791, 0.19274638, 0.21527742,
0.21362753, 0.20255772])
pca.components_[12]
array([ 0.04388782, -0.00203733, -0.13487687, 0.04455223, -0.00117546,
-0.13226635, 0.09544307, -0.07259608, 0.49104481, -0.15000056,
-0.13510115, -0.0697583 , -0.68365294, -0.05244133, 0.15132581,
0.1107867 , -0.02350325, 0.30832705, 0.13873147, 0.04869801,
-0.11719101, 0.15165661])
# Loadings of the first principal component, indexed by feature name
c1_df = pd.Series(pca.components_[0], index=X_sel.columns)
# Rank features by the magnitude of their contribution to PC1
c1_df.abs().sort_values(ascending=False)
standard_model condition 0.265938 standard_model usage 0.246716 standard_make condition 0.244748 body_type condition 0.232148 fuel_type condition 0.215277 manufacturer_popularity condition 0.213628 year_of_registration standard_model 0.211662 standard_model 0.211033 standard_colour standard_model 0.210017 year_of_registration condition 0.208942 condition 0.208928 standard_colour condition 0.208901 standard_make usage 0.204285 standard_model manufacturer_popularity 0.203745 condition^2 0.202558 standard_model^2 0.201391 year_of_registration^2 0.201222 year_of_registration 0.201182 standard_model body_type 0.201175 standard_model fuel_type 0.200707 body_type usage 0.192746 standard_make standard_model 0.185848 dtype: float64
# Loadings of the 13th principal component (index 12), indexed by feature name
c1_df14 = pd.Series(pca.components_[12], index=X_sel.columns)
# Rank features by the magnitude of their contribution to PC13
c1_df14.abs().sort_values(ascending=False)
standard_model body_type 0.683653 standard_make standard_model 0.491045 body_type condition 0.308327 condition^2 0.151657 standard_model manufacturer_popularity 0.151326 standard_make condition 0.150001 body_type usage 0.138731 standard_make usage 0.135101 condition 0.134877 year_of_registration condition 0.132266 manufacturer_popularity condition 0.117191 standard_model condition 0.110787 standard_colour standard_model 0.095443 standard_colour condition 0.072596 standard_model^2 0.069758 standard_model fuel_type 0.052441 fuel_type condition 0.048698 year_of_registration^2 0.044552 year_of_registration 0.043888 standard_model usage 0.023503 standard_model 0.002037 year_of_registration standard_model 0.001175 dtype: float64
(e.g., choose suitable algorithm(s), fit and tune models; grid-search, rank, and select model(s) based on evaluation metrics and the under/overfit trade-off; build an ensemble with the best performing models/configurations).
We will be taking the approach of building models with the full preprocessed pipelines, and will also consider a second approach of building models based on feature selection and dimensionality reduction.
FUNCTIONS FOR REGRESSION PIPELINES
# lets create a reusable function for our regression pipelines.
# the preprocessing pipe is constant.
# without feature selection and PCA
def create_regr_pipe(model):
    """Build a regression pipeline: shared preprocessing followed by `model`.

    Parameters
    ----------
    model : sklearn regressor used as the final 'regr' step.

    Returns
    -------
    sklearn Pipeline with pandas output enabled (unfitted).
    """
    regr_pipe = Pipeline(
        steps=[
            ("pp", preprocessor),
            ("regr", model)
        ]
    ).set_output(transform="pandas")
    return regr_pipe
# lets create a reusable function for our regression pipelines.
# the preprocessing pipe is constant.
# with feature selection
def create_regr_pipe_featsel(model):
    """Build a regression pipeline with univariate feature selection.

    Steps: shared preprocessing -> SelectKBest(f_regression, k=22) -> `model`.

    Parameters
    ----------
    model : sklearn regressor used as the final 'regr' step.

    Returns
    -------
    sklearn Pipeline with pandas output enabled (unfitted).
    """
    regr_pipe = Pipeline(
        steps=[
            ("pp", preprocessor),
            ("featsel", SelectKBest(f_regression, k=22)),
            ("regr", model)
        ]
    ).set_output(transform="pandas")
    return regr_pipe
# lets create a reusable function for our regression pipelines.
# the preprocessing pipe is constant.
# with feature selection and PCA
def create_regr_pipe_pca(model):
    """Build a regression pipeline with feature selection and PCA reduction.

    Steps: shared preprocessing -> SelectKBest(f_regression, k=22)
    -> PCA(n_components=13) -> `model`.  The k=22 / 13-component choices
    match the threshold analysis performed earlier in the notebook.

    Parameters
    ----------
    model : sklearn regressor used as the final 'regr' step.

    Returns
    -------
    sklearn Pipeline with pandas output enabled (unfitted).
    """
    regr_pipe = Pipeline(
        steps=[
            ("pp", preprocessor),
            ("featsel", SelectKBest(f_regression, k=22)),
            ("pca", PCA(n_components=13)),
            ("regr", model)
        ]
    ).set_output(transform="pandas")
    return regr_pipe
The linear model i will be building is the regularized Linear Regression model (Ridge)
# Ridge regression on all preprocessed features (no selection / PCA)
lr = create_regr_pipe(Ridge())
# fitting the training data
lr.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', Ridge())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', Ridge())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
Ridge()
# Model performance evaluation.
# Predict once and reuse: the previous version re-ran lr.predict(X_test)
# for every metric (four full pipeline passes for identical output).
lr_pred = lr.predict(X_test)
lr_r2_score = r2_score(y_test, lr_pred)
lr_rmse = rmse(y_test, lr_pred)
lr_mse = mean_squared_error(y_test, lr_pred)
lr_mae = mean_absolute_error(y_test, lr_pred)
# Print the evaluation metrics
print('Evaluation metrics of Linear regression(Ridge) model with all the preprocessed Features:')
print("r2 score: ", lr_r2_score)
print('Root Mean Squared Error:', lr_rmse)
print('Mean Squared Error:', lr_mse)
print('Mean Absolute Error:', lr_mae)
Evaluation metrics of Linear regression(Ridge) model with all the preprocessed Features: r2 score: 0.844428399347362 Root Mean Squared Error: 2963.3071977481527 Mean Squared Error: 8781189.548226008 Mean Absolute Error: 2081.5998659552256
# Ridge regression with SelectKBest(k=22) feature selection
lr_featsel = create_regr_pipe_featsel(Ridge())
# fitting the training data
lr_featsel.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', Ridge())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', Ridge())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
Ridge()
# Names of the 22 features retained by the fitted SelectKBest step
lr_featsel['featsel'].get_feature_names_out()
array(['year_of_registration', 'standard_model', 'condition',
'year_of_registration^2', 'year_of_registration standard_model',
'year_of_registration condition', 'standard_colour standard_model',
'standard_colour condition', 'standard_make standard_model',
'standard_make condition', 'standard_make usage',
'standard_model^2', 'standard_model body_type',
'standard_model fuel_type',
'standard_model manufacturer_popularity',
'standard_model condition', 'standard_model usage',
'body_type condition', 'body_type usage', 'fuel_type condition',
'manufacturer_popularity condition', 'condition^2'], dtype=object)
# Model performance evaluation on the held-out test set.
# Predict once and reuse the predictions for every metric (the original
# re-ran the full pipeline's predict for each of the four metrics).
featsel_preds = lr_featsel.predict(X_test)
lr_featsel_r2_score = r2_score(y_test, featsel_preds)
lr_featsel_rmse = rmse(y_test, featsel_preds)
lr_featsel_mse = mean_squared_error(y_test, featsel_preds)
lr_featsel_mae = mean_absolute_error(y_test, featsel_preds)
# Print the evaluation metrics. The header now says k=22, matching the
# fitted SelectKBest(k=22) step (the original text said k=25, which was stale).
print('Evaluation metrics of Linear regression(Ridge) with Feature selection, k=22:')
print("r2 score: ", lr_featsel_r2_score)
print('Root Mean Squared Error:', lr_featsel_rmse)
print('Mean Squared Error:', lr_featsel_mse)
print('Mean Absolute Error:', lr_featsel_mae)
Evaluation metrics of Linear regression(Ridge) with Feature selection, k=25: r2 score: 0.8261606050186868 Root Mean Squared Error: 3132.460597107935 Mean Squared Error: 9812309.392433802 Mean Absolute Error: 2229.2523894155524
# Build the Ridge pipeline variant that appends PCA after feature selection,
# then fit it on the training split in one chained call (fit returns self).
lr_pca = create_regr_pipe_pca(Ridge()).fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('pca', PCA(n_components=13)), ('regr', Ridge())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('pca', PCA(n_components=13)), ('regr', Ridge())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
PCA(n_components=13)
Ridge()
lr_pca['pca'].get_feature_names_out()
array(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7',
'pca8', 'pca9', 'pca10', 'pca11', 'pca12'], dtype=object)
lr_pca['regr'].coef_
array([ 1837.53408568, 237.68374867, -267.71661618, 884.02027936,
-316.95788674, 353.35717622, -79.48664382, 528.89972389,
449.80125838, 305.3641455 , 1117.00006002, -4389.83966996,
4014.88632754])
# Model performance evaluation on the held-out test set.
# Predict once and reuse the predictions for every metric (the original
# re-ran the full pipeline's predict for each of the four metrics).
pca_preds = lr_pca.predict(X_test)
lr_pca_r2_score = r2_score(y_test, pca_preds)
lr_pca_rmse = rmse(y_test, pca_preds)
lr_pca_mse = mean_squared_error(y_test, pca_preds)
lr_pca_mae = mean_absolute_error(y_test, pca_preds)
# Print the evaluation metrics. The header now matches the fitted pipeline,
# SelectKBest(k=22) and PCA(n_components=13) -- the original text said
# "k=25 ... n_components=15", which did not match the model being evaluated.
print('Evaluation metrics of Linear regression(Ridge) with Feature selection, k=22 and PCA, n_components=13:')
print("r2 score: ", lr_pca_r2_score)
print('Root Mean Squared Error:', lr_pca_rmse)
print('Mean Squared Error:', lr_pca_mse)
print('Mean Absolute Error:', lr_pca_mae)
Evaluation metrics of Linear regression(Ridge) with Feature selection, k=25 and PCA, n_components=15: r2 score: 0.816377212486792 Root Mean Squared Error: 3219.398992686099 Mean Squared Error: 10364529.874108268 Mean Absolute Error: 2300.785755608655
The three linear regression models have very similar evaluation metrics, but they are not identical: the model trained on all preprocessed features performs best (r2 of about 0.844, RMSE of about 2963), the feature-selection model slightly worse (r2 of about 0.826, RMSE of about 3132), and the feature-selection-plus-PCA model worst of the three (r2 of about 0.816, RMSE of about 3219). This means that the feature selection and PCA steps did not improve the performance of the model. It is possible that the original set of features was already informative enough to obtain a good prediction, or that the feature selection and PCA were not configured optimally. It would be worth investigating the feature selection and PCA settings further, and experimenting with different configurations to see whether they can improve the model performance.
In our linear regression model, we experimented with different combinations of feature selection and principal component analysis (PCA) to evaluate their impact on model performance. We report the results for five different combinations of feature selection and PCA.
For the first combination, we increased the number of components and features to n=k=10. This model achieved a score of 0.804 and an RMSE of 3327. This indicates that increasing the number of components and features beyond a certain point does not result in significant improvements in model performance.
For the second combination, we reduced the number of components and features to n=k=5. This resulted in a significant drop in model performance, with a score of 0.518 and an RMSE of 5216. This suggests that reducing the number of features and components too much can lead to a loss of important information.
For the third combination, we performed PCA with n=14 components and feature selection with k=14 features. This model achieved a score of 0.809, indicating that it explained about 81% of the variance in the target variable. The root mean squared error (RMSE) for this model was 3283, which means that the average difference between the predicted and actual values of the target variable was 3283. Overall, this model performed similarly to the linear regression model without feature selection and PCA.
For the fourth combination, we performed PCA with n=13 components and feature selection with k=13 features. This model achieved a score of 0.809 and an RMSE of 3287, which is similar to the first combination. This suggests that selecting a slightly different combination of features and components can lead to similar model performance.
For the final combination, we increased the number of components and features to n=k=16. This model achieved a score of 0.809 and an RMSE of 3283, which is similar to the third and fourth combinations. This suggests that increasing the number of components and features beyond a certain point does not lead to significant improvements in model performance.
Overall, our experiments suggest that there is a tradeoff between the number of features and components and model performance. Selecting too few or too many features and components can lead to a loss of important information and reduced model performance. However, selecting a slightly different combination of features and components can lead to similar model performance, indicating that there is some flexibility in the choice of features and components.
# Rank the PCA-space Ridge coefficients by absolute magnitude so the most
# influential principal components appear first.
ridge_step = lr_pca['regr']
coef_table = pd.DataFrame(
    {'coef': ridge_step.coef_, 'features': ridge_step.feature_names_in_}
)
coef_table.sort_values(by='coef', key=np.abs, ascending=False)
| coef | features | |
|---|---|---|
| 11 | -4389.839670 | pca11 |
| 12 | 4014.886328 | pca12 |
| 0 | 1837.534086 | pca0 |
| 10 | 1117.000060 | pca10 |
| 3 | 884.020279 | pca3 |
| 7 | 528.899724 | pca7 |
| 8 | 449.801258 | pca8 |
| 5 | 353.357176 | pca5 |
| 4 | -316.957887 | pca4 |
| 9 | 305.364145 | pca9 |
| 2 | -267.716616 | pca2 |
| 1 | 237.683749 | pca1 |
| 6 | -79.486644 | pca6 |
# Heatmap of the features that load most strongly on principal component 12
# (zero-based index 11). The original notebook ran this identical call twice
# back-to-back; the accidental duplicate has been removed.
plot_correlation_heatmap(pca, X_sel, 11)
# Heatmap of the prominent features of principal component 1 (index 0).
plot_correlation_heatmap(pca, X_sel, 0)
# Side-by-side comparison of the three Ridge variants on the test set.
def _print_metrics(title, r2, rmse_value, mse, mae):
    """Print one model's evaluation metrics in the shared four-line format."""
    print(title)
    print("r2 score: ", r2)
    print('Root Mean Squared Error:', rmse_value)
    print('Mean Squared Error:', mse)
    print('Mean Absolute Error:', mae)

# Ridge on all preprocessed features.
_print_metrics(
    'Evaluation metrics of Linear regression(Ridge) model with all the preprocessed Features:',
    lr_r2_score, lr_rmse, lr_mse, lr_mae,
)
print('---------------------------------------------------------------------------------------------------------------------')
# Ridge after SelectKBest feature selection.
_print_metrics(
    'Evaluation metrics of Linear regression(Ridge) with Feature selection, k=22:',
    lr_featsel_r2_score, lr_featsel_rmse, lr_featsel_mse, lr_featsel_mae,
)
print('---------------------------------------------------------------------------------------------------------------------')
# Ridge after feature selection followed by PCA.
_print_metrics(
    'Evaluation metrics of Linear regression(Ridge) with Feature selection, k=22 and PCA, n_components=13:',
    lr_pca_r2_score, lr_pca_rmse, lr_pca_mse, lr_pca_mae,
)
print()
Evaluation metrics of Linear regression(Ridge) model with all the preprocessed Features: r2 score: 0.844428399347362 Root Mean Squared Error: 2963.3071977481527 Mean Squared Error: 8781189.548226008 Mean Absolute Error: 2081.5998659552256 --------------------------------------------------------------------------------------------------------------------- Evaluation metrics of Linear regression(Ridge) with Feature selection, k=22: r2 score: 0.8261606050186868 Root Mean Squared Error: 3132.460597107935 Mean Squared Error: 9812309.392433802 Mean Absolute Error: 2229.2523894155524 --------------------------------------------------------------------------------------------------------------------- Evaluation metrics of Linear regression(Ridge) with Feature selection, k=22 and PCA, n_components=13: r2 score: 0.816377212486792 Root Mean Squared Error: 3219.398992686099 Mean Squared Error: 10364529.874108268 Mean Absolute Error: 2300.785755608655
The evaluation metrics show the performance of different linear regression models on the dataset. The first model used all the preprocessed features, and achieved an r2 score of 0.844 and a root mean squared error (RMSE) of about 2963. This indicates that the model explains about 84.4% of the variance in the target variable, and on average its predictions are off by about 2963 units.
The second model used feature selection with k=22, and achieved an r2 score of 0.826 and an RMSE of about 3132. This means that the model explains about 82.6% of the variance in the target variable, but its predictions are slightly less accurate than the first model's, with an average error of about 3132 units.
The third model combined feature selection (k=22) with PCA (n_components=13), and achieved an r2 score of 0.816 and an RMSE of about 3219. This means that the model explains about 81.6% of the variance in the target variable, but its predictions are the least accurate among the three models, with an average error of about 3219 units.
Overall, the results suggest that the model with all the preprocessed features performs slightly better than the model with feature selection, while the model with PCA performs the least accurately among the three models.
# Ridge regularization strengths to search: 11 values log-spaced from 1e-5 to 1e5.
param_grid = {'regr__alpha': np.logspace(-5, 5, num=11)}
param_grid
{'regr__alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
1.e+03, 1.e+04, 1.e+05])}
# Grid search over the Ridge alpha values. Train scores are kept so that
# over-/under-fitting can be inspected; (negated) RMSE is the selection metric.
grid_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    return_train_score=True,
    scoring='neg_root_mean_squared_error',
)
# Time the grid search: record the wall-clock start just before fitting.
import time
start_time = time.time()
# Runs 5-fold CV for each of the 11 alpha values (55 pipeline fits + a refit).
grid_lr.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncode...
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', Ridge())]),
param_grid={'regr__alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
1.e+03, 1.e+04, 1.e+05])},
return_train_score=True, scoring='neg_root_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncode...
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', Ridge())]),
param_grid={'regr__alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
1.e+03, 1.e+04, 1.e+05])},
return_train_score=True, scoring='neg_root_mean_squared_error')Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', Ridge())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
Ridge()
# Stop the timer started before grid_lr.fit and report the elapsed seconds.
end_time = time.time()
total_time = end_time - start_time
print("Total time taken:", total_time)
Total time taken: 204.2441520690918
# Collect the cross-validation results into a DataFrame for inspection.
results_lr = pd.DataFrame(grid_lr.cv_results_)
# List the available result columns (per-split scores, timings, ranks, ...).
results_lr.columns
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_regr__alpha', 'params', 'split0_test_score', 'split1_test_score',
'split2_test_score', 'split3_test_score', 'split4_test_score',
'mean_test_score', 'std_test_score', 'rank_test_score',
'split0_train_score', 'split1_train_score', 'split2_train_score',
'split3_train_score', 'split4_train_score', 'mean_train_score',
'std_train_score'],
dtype='object')
# Tidy view of the CV results: one row per alpha, best-ranked configuration
# first, with the parameter column renamed to a short 'alpha'.
summary_cols = [
    'param_regr__alpha', 'params',
    'mean_test_score', 'std_test_score',
    'mean_train_score', 'std_train_score',
    'rank_test_score',
]
results_lr[summary_cols].sort_values('rank_test_score').rename(
    columns={'param_regr__alpha': 'alpha'}
)
| alpha | params | mean_test_score | std_test_score | mean_train_score | std_train_score | rank_test_score | |
|---|---|---|---|---|---|---|---|
| 1 | 0.0001 | {'regr__alpha': 0.0001} | -2940.849337 | 21.710158 | -2932.040056 | 4.441074 | 1 |
| 0 | 0.00001 | {'regr__alpha': 1e-05} | -2940.851162 | 21.707313 | -2932.038308 | 4.441058 | 2 |
| 2 | 0.001 | {'regr__alpha': 0.001} | -2940.963669 | 21.716650 | -2932.173341 | 4.440763 | 3 |
| 3 | 0.01 | {'regr__alpha': 0.01} | -2944.489694 | 21.722553 | -2935.735495 | 4.430941 | 4 |
| 4 | 0.1 | {'regr__alpha': 0.1} | -2959.448727 | 21.714100 | -2950.770284 | 4.461738 | 5 |
| 5 | 1.0 | {'regr__alpha': 1.0} | -2977.097244 | 21.655551 | -2968.479414 | 4.495604 | 6 |
| 6 | 10.0 | {'regr__alpha': 10.0} | -2981.382747 | 21.621579 | -2972.800682 | 4.518187 | 7 |
| 7 | 100.0 | {'regr__alpha': 100.0} | -2985.995030 | 21.419188 | -2977.501523 | 4.549846 | 8 |
| 8 | 1000.0 | {'regr__alpha': 1000.0} | -3015.497283 | 20.787917 | -3007.098365 | 4.398417 | 9 |
| 9 | 10000.0 | {'regr__alpha': 10000.0} | -3101.994567 | 20.209641 | -3093.712870 | 4.465067 | 10 |
| 10 | 100000.0 | {'regr__alpha': 100000.0} | -3212.611298 | 19.620347 | -3204.835876 | 4.696120 | 11 |
# The pipeline refit on the full training set with the best-scoring alpha.
lr_best_pipe = grid_lr.best_estimator_
lr_best_pipe
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', Ridge(alpha=0.0001))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', Ridge(alpha=0.0001))])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
Ridge(alpha=0.0001)
# Model performance evaluation of the grid-search best estimator on the
# held-out test set. Predict once and reuse the predictions for every metric
# (the original re-ran the full pipeline's predict for each of the four).
best_preds = lr_best_pipe.predict(X_test)
lr_best_pipe_r2_score = r2_score(y_test, best_preds)
lr_best_pipe_rmse = rmse(y_test, best_preds)
lr_best_pipe_mse = mean_squared_error(y_test, best_preds)
lr_best_pipe_mae = mean_absolute_error(y_test, best_preds)
# Print the evaluation metrics
print('Grid-search best_pipe Linear regression(Ridge) model with all the preprocessed Features:')
print("r2 score: ", lr_best_pipe_r2_score)
print('Root Mean Squared Error:', lr_best_pipe_rmse)
print('Mean Squared Error:', lr_best_pipe_mse)
print('Mean Absolute Error:', lr_best_pipe_mae)
Grid-search best_pipe Linear regression(Ridge) model with all the preprocessed Features: r2 score: 0.8480511004034516 Root Mean Squared Error: 2928.6016012153827 Mean Squared Error: 8576707.338641303 Mean Absolute Error: 2051.5179302551464
lr_best_pipe['regr'].feature_names_in_
array(['mileage', 'year_of_registration', 'crossover_car_and_van_True',
'standard_colour', 'standard_make', 'standard_model', 'body_type',
'fuel_type', 'manufacturer_popularity', 'condition', 'usage',
'mileage^2', 'mileage year_of_registration',
'mileage crossover_car_and_van_True', 'mileage standard_colour',
'mileage standard_make', 'mileage standard_model',
'mileage body_type', 'mileage fuel_type',
'mileage manufacturer_popularity', 'mileage condition',
'mileage usage', 'year_of_registration^2',
'year_of_registration crossover_car_and_van_True',
'year_of_registration standard_colour',
'year_of_registration standard_make',
'year_of_registration standard_model',
'year_of_registration body_type', 'year_of_registration fuel_type',
'year_of_registration manufacturer_popularity',
'year_of_registration condition', 'year_of_registration usage',
'crossover_car_and_van_True^2',
'crossover_car_and_van_True standard_colour',
'crossover_car_and_van_True standard_make',
'crossover_car_and_van_True standard_model',
'crossover_car_and_van_True body_type',
'crossover_car_and_van_True fuel_type',
'crossover_car_and_van_True manufacturer_popularity',
'crossover_car_and_van_True condition',
'crossover_car_and_van_True usage', 'standard_colour^2',
'standard_colour standard_make', 'standard_colour standard_model',
'standard_colour body_type', 'standard_colour fuel_type',
'standard_colour manufacturer_popularity',
'standard_colour condition', 'standard_colour usage',
'standard_make^2', 'standard_make standard_model',
'standard_make body_type', 'standard_make fuel_type',
'standard_make manufacturer_popularity', 'standard_make condition',
'standard_make usage', 'standard_model^2',
'standard_model body_type', 'standard_model fuel_type',
'standard_model manufacturer_popularity',
'standard_model condition', 'standard_model usage', 'body_type^2',
'body_type fuel_type', 'body_type manufacturer_popularity',
'body_type condition', 'body_type usage', 'fuel_type^2',
'fuel_type manufacturer_popularity', 'fuel_type condition',
'fuel_type usage', 'manufacturer_popularity^2',
'manufacturer_popularity condition',
'manufacturer_popularity usage', 'condition^2', 'condition usage',
'usage^2'], dtype=object)
lr_best_pipe['regr'].coef_
array([-1.68254258e+05, -1.31015182e+05, 1.97328445e+04, -1.43629200e+03,
1.49619843e+05, -8.87356891e+05, 2.10797902e+05, -4.44979834e+04,
-7.30464005e+04, -2.57996914e+05, -3.98555440e+04, 1.89966586e+02,
1.69047175e+05, 7.46323854e+00, -3.05089616e+02, -1.55352949e+03,
-2.80118574e+02, -4.51196377e+02, 7.74237986e+02, 6.91527388e+02,
-1.68935820e+03, -9.34653809e+01, 1.29477394e+05, -3.88018263e+04,
1.35142480e+03, -1.50998679e+05, 8.94490973e+05, -2.04989772e+05,
4.32372782e+04, 6.33437091e+04, 2.58781904e+05, 3.99243443e+04,
1.96969488e+04, -6.75647910e+01, -2.05747931e+02, 3.58294408e+02,
2.36113075e+02, -7.23598262e+02, -4.27679404e+01, -1.80546680e+01,
3.89035910e+01, -3.48501359e+02, 9.96334876e+02, 2.60493951e+02,
3.26648450e+01, -2.96036295e+02, 7.19041216e+02, 4.52610308e+02,
-4.33004190e+02, 2.32047665e+03, 1.47205166e+03, 3.62730891e+03,
-2.85740334e+03, -1.66572147e+03, 2.02768127e+03, -5.70630005e+02,
-1.39527874e+02, -1.89702538e+03, -3.17160579e+03, 3.73657028e+03,
-4.63788598e+03, -3.79041889e+02, -7.60194131e+03, 2.21906209e+03,
-2.07976543e+03, 1.78553463e+03, 4.41002455e+02, 1.79990813e+03,
-7.13668870e+02, 3.39181594e+03, 6.28830204e+02, 9.84054443e+03,
-8.91335949e+02, 1.24395760e+03, -7.51819160e+01, -2.22442681e+03,
-1.88598051e+02])
# Rank the tuned Ridge coefficients by absolute magnitude so the most
# influential (positively or negatively) features appear first.
best_regr = lr_best_pipe['regr']
coef_table = pd.DataFrame(
    {'coef': best_regr.coef_, 'features': best_regr.feature_names_in_}
)
coef_table.sort_values(by='coef', key=np.abs, ascending=False)
| coef | features | |
|---|---|---|
| 26 | 894490.973046 | year_of_registration standard_model |
| 5 | -887356.890682 | standard_model |
| 30 | 258781.904021 | year_of_registration condition |
| 9 | -257996.914384 | condition |
| 6 | 210797.902303 | body_type |
| ... | ... | ... |
| 38 | -42.767940 | crossover_car_and_van_True manufacturer_popula... |
| 40 | 38.903591 | crossover_car_and_van_True usage |
| 44 | 32.664845 | standard_colour body_type |
| 39 | -18.054668 | crossover_car_and_van_True condition |
| 13 | 7.463239 | mileage crossover_car_and_van_True |
77 rows × 2 columns
def plot_rf_feat_imp_barh(rf, feat_names, ax=None, top_feat_k=10, style_kws=None):
    """Horizontal bar chart of a fitted model's largest coefficients.

    NOTE(review): despite the "rf" / "feat_imp" naming, this reads
    ``rf.coef_`` (linear-model coefficients), not ``feature_importances_`` —
    in this notebook it is called with the tuned linear regressor.

    Parameters
    ----------
    rf : fitted estimator exposing ``coef_``.
    feat_names : sequence of str
        Feature labels aligned with ``coef_``.
    ax : matplotlib Axes, optional
        Axes to draw on; a new figure/axes pair is created if omitted.
    top_feat_k : int
        Number of largest (signed, not absolute) coefficients to show.
    style_kws : dict, optional
        Extra keyword arguments forwarded to ``Series.plot.barh``.

    Returns
    -------
    The matplotlib Axes the bars were drawn on.
    """
    # avoid the shared-mutable-default pitfall of `style_kws={}`
    if style_kws is None:
        style_kws = {}
    if ax is None:
        fig, ax = plt.subplots()
    # sort ascending and keep the tail: barh then shows the top-k at the top.
    # Fix: pass `ax=ax` so a caller-supplied axes is actually used (the
    # original created/ignored `ax` and always drew on the current axes).
    return pd.Series(
        rf.coef_,
        index=feat_names
    ).sort_values().tail(top_feat_k).plot.barh(ax=ax, **style_kws)
# Bar chart of the linear model's top-k largest (signed) coefficients.
plot_rf_feat_imp_barh(
lr_best_pipe['regr'],
lr_best_pipe['regr'].feature_names_in_,
);
# Random-forest pipeline built with the same preprocessing as the linear
# models (imputation, encoding, polynomial interactions, scaling).
# NOTE(review): create_regr_pipe is defined earlier in the notebook.
rfr = create_regr_pipe(RandomForestRegressor())
# fitting the training data
rfr.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', RandomForestRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', RandomForestRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
RandomForestRegressor()
# Model performance evaluation of the random-forest pipeline on the held-out
# test set. Predict once and reuse the result for every metric — the original
# recomputed rfr.predict(X_test) four times, quadrupling the (expensive)
# forest inference for identical output.
rfr_pred = rfr.predict(X_test)
rfr_r2_score = r2_score(y_test, rfr_pred)
rfr_rmse = rmse(y_test, rfr_pred)
rfr_mse = mean_squared_error(y_test, rfr_pred)
rfr_mae = mean_absolute_error(y_test, rfr_pred)
# Print the evaluation metrics
print('Evaluation metrics of Random forest regressor model with all the preprocessed Features:')
print("r2 score: ", rfr_r2_score)
print('Root Mean Squared Error:', rfr_rmse)
print('Mean Squared Error:', rfr_mse)
print('Mean Absolute Error:', rfr_mae)
Evaluation metrics of Random forest regressor model with all the preprocessed Features: r2 score: 0.9243398450133282 Root Mean Squared Error: 2066.546248837117 Mean Squared Error: 4270613.398582761 Mean Absolute Error: 1376.9511766147268
# Random-forest pipeline with univariate feature selection (SelectKBest,
# k=22, f_regression scoring) appended after the shared preprocessing.
rfr_featsel = create_regr_pipe_featsel(RandomForestRegressor())
# fitting the training data
rfr_featsel.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', RandomForestRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', RandomForestRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
RandomForestRegressor()
# Names of the 22 (mostly interaction) features kept by SelectKBest.
rfr_featsel['featsel'].get_feature_names_out()
array(['year_of_registration', 'standard_model', 'condition',
'year_of_registration^2', 'year_of_registration standard_model',
'year_of_registration condition', 'standard_colour standard_model',
'standard_colour condition', 'standard_make standard_model',
'standard_make condition', 'standard_make usage',
'standard_model^2', 'standard_model body_type',
'standard_model fuel_type',
'standard_model manufacturer_popularity',
'standard_model condition', 'standard_model usage',
'body_type condition', 'body_type usage', 'fuel_type condition',
'manufacturer_popularity condition', 'condition^2'], dtype=object)
# Model performance evaluation of the feature-selection pipeline on the test
# set. Predict once and reuse it for every metric — the original recomputed
# rfr_featsel.predict(X_test) four times for identical output.
rfr_featsel_pred = rfr_featsel.predict(X_test)
rfr_featsel_r2_score = r2_score(y_test, rfr_featsel_pred)
rfr_featsel_rmse = rmse(y_test, rfr_featsel_pred)
rfr_featsel_mse = mean_squared_error(y_test, rfr_featsel_pred)
rfr_featsel_mae = mean_absolute_error(y_test, rfr_featsel_pred)
# Print the evaluation metrics
print('Evaluation metrics of Random forest regressor model with Feature selection, k=22:')
print("r2 score: ", rfr_featsel_r2_score)
print('Root Mean Squared Error:', rfr_featsel_rmse)
print('Mean Squared Error:', rfr_featsel_mse)
print('Mean Absolute Error:', rfr_featsel_mae)
Evaluation metrics of Random forest regressor model with Feature selection, k=22: r2 score: 0.9253017672275626 Root Mean Squared Error: 2053.367479279037 Mean Squared Error: 4216318.004960747 Mean Absolute Error: 1400.5649137193361
# Random-forest pipeline with feature selection (k=22) followed by PCA
# dimensionality reduction (n_components=13).
rfr_pca = create_regr_pipe_pca(RandomForestRegressor())
# fitting the training data
rfr_pca.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('pca', PCA(n_components=13)),
('regr', RandomForestRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('pca', PCA(n_components=13)),
('regr', RandomForestRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
PCA(n_components=13)
RandomForestRegressor()
# Names of the 13 retained principal components (pca0 .. pca12).
rfr_pca['pca'].get_feature_names_out()
array(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7',
'pca8', 'pca9', 'pca10', 'pca11', 'pca12'], dtype=object)
# Model performance evaluation of the PCA pipeline on the test set. Predict
# once and reuse it for every metric — the original recomputed
# rfr_pca.predict(X_test) four times for identical output.
rfr_pca_pred = rfr_pca.predict(X_test)
rfr_pca_r2_score = r2_score(y_test, rfr_pca_pred)
rfr_pca_rmse = rmse(y_test, rfr_pca_pred)
rfr_pca_mse = mean_squared_error(y_test, rfr_pca_pred)
rfr_pca_mae = mean_absolute_error(y_test, rfr_pca_pred)
# Print the evaluation metrics.
# Fix: the original header claimed k=25 / n_components=15, but the fitted
# pipeline uses SelectKBest(k=22) and PCA(n_components=13) — the message now
# matches the model actually evaluated (and the summary text later on).
print('Evaluation metrics of Random forest regressor model with Feature selection, k=22 and PCA, n_components=13:')
print("r2 score: ", rfr_pca_r2_score)
print('Root Mean Squared Error:', rfr_pca_rmse)
print('Mean Squared Error:', rfr_pca_mse)
print('Mean Absolute Error:', rfr_pca_mae)
Evaluation metrics of Random forest regressor model with Feature selection, k=25 and PCA, n_components=15: r2 score: 0.9201477726491154 Root Mean Squared Error: 2123.0246326107103 Mean Squared Error: 4507233.590671841 Mean Absolute Error: 1434.7331830815838
# Rank the principal components by the random forest's impurity-based
# feature importances, largest first.
pca_regr = rfr_pca['regr']
importance_table = pd.DataFrame(
    {'importance': pca_regr.feature_importances_,
     'features': pca_regr.feature_names_in_}
)
importance_table.sort_values(by='importance', key=np.abs, ascending=False)
| importance | features | |
|---|---|---|
| 0 | 0.851173 | pca0 |
| 1 | 0.024606 | pca1 |
| 4 | 0.023409 | pca4 |
| 3 | 0.022661 | pca3 |
| 12 | 0.013504 | pca12 |
| 2 | 0.011957 | pca2 |
| 6 | 0.011451 | pca6 |
| 5 | 0.010471 | pca5 |
| 11 | 0.008484 | pca11 |
| 9 | 0.007437 | pca9 |
| 7 | 0.006234 | pca7 |
| 10 | 0.004946 | pca10 |
| 8 | 0.003668 | pca8 |
# plotting the prominent features of principal component 1 (index 0) with heatmap
plot_correlation_heatmap(pca, X_sel, 0)
# plotting the prominent features of principal component 2 (index 1) with heatmap
plot_correlation_heatmap(pca, X_sel, 1)
# plotting the prominent features of principal component 5 (index 4) with heatmap
# (the original comment said "component 6", which does not match index 4)
plot_correlation_heatmap(pca, X_sel, 4)
# Side-by-side summary of the three random-forest variants evaluated above:
# full feature set, feature selection only, and feature selection + PCA.
# the evaluation metrics for Random forest regressor with model all the preprocessed Features
print('Evaluation metrics of Random forest regressor model with all the preprocessed Features:')
print("r2 score: ", rfr_r2_score)
print('Root Mean Squared Error:', rfr_rmse)
print('Mean Squared Error:', rfr_mse)
print('Mean Absolute Error:', rfr_mae)
print('---------------------------------------------------------------------------------------------------------------------')
# the evaluation metrics for Random forest regressor model with Feature selection
print('Evaluation metrics of Random forest regressor model with Feature selection, k=22:')
print("r2 score: ", rfr_featsel_r2_score)
print('Root Mean Squared Error:', rfr_featsel_rmse)
print('Mean Squared Error:', rfr_featsel_mse)
print('Mean Absolute Error:', rfr_featsel_mae)
print('---------------------------------------------------------------------------------------------------------------------')
# the evaluation metrics for Random forest regressor model with PCA
print('Evaluation metrics of Random forest regressor model with Feature selection, k=22 and PCA, n_components=13:')
print("r2 score: ", rfr_pca_r2_score)
print('Root Mean Squared Error:', rfr_pca_rmse)
print('Mean Squared Error:', rfr_pca_mse)
print('Mean Absolute Error:', rfr_pca_mae)
print()
Evaluation metrics of Random forest regressor model with all the preprocessed Features: r2 score: 0.9243398450133282 Root Mean Squared Error: 2066.546248837117 Mean Squared Error: 4270613.398582761 Mean Absolute Error: 1376.9511766147268 --------------------------------------------------------------------------------------------------------------------- Evaluation metrics of Random forest regressor model with Feature selection, k=22: r2 score: 0.9253017672275626 Root Mean Squared Error: 2053.367479279037 Mean Squared Error: 4216318.004960747 Mean Absolute Error: 1400.5649137193361 --------------------------------------------------------------------------------------------------------------------- Evaluation metrics of Random forest regressor model with Feature selection, k=22 and PCA, n_components=13: r2 score: 0.9201477726491154 Root Mean Squared Error: 2123.0246326107103 Mean Squared Error: 4507233.590671841 Mean Absolute Error: 1434.7331830815838
# Hyperparameter grid for the RandomForestRegressor step of the pipeline.
# Keys follow sklearn's `step__param` convention ('regr' is the step name).
# Fix: a dead first draft of this dict — immediately overwritten, with
# malformed keys missing the double underscore ('regrmin_samples_leaf') and
# the 'auto' max_features value removed in modern sklearn — has been dropped.
param_grid = {
    'regr__min_samples_leaf': [6, 9, 12],
    'regr__min_samples_split': [6, 8, 10],
    'regr__max_features': ['sqrt', 'log2']
}
# Grid search over the random-forest hyperparameters on the feature-selection
# pipeline, scored by negative RMSE; return_train_score=True keeps the train
# scores in cv_results_ so over/under-fitting can be inspected.
rfr_grid = GridSearchCV(
rfr_featsel, param_grid, return_train_score=True,
scoring='neg_root_mean_squared_error'
)
# Time the (expensive) grid-search fit with wall-clock timestamps.
import time
start_time = time.time()
rfr_grid.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncode...
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', RandomForestRegressor())]),
param_grid={'regr__max_features': ['sqrt', 'log2'],
'regr__min_samples_leaf': [6, 9, 12],
'regr__min_samples_split': [6, 8, 10]},
return_train_score=True, scoring='neg_root_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncode...
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', RandomForestRegressor())]),
param_grid={'regr__max_features': ['sqrt', 'log2'],
'regr__min_samples_leaf': [6, 9, 12],
'regr__min_samples_split': [6, 8, 10]},
return_train_score=True, scoring='neg_root_mean_squared_error')Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', RandomForestRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
RandomForestRegressor()
# Report how long the grid-search fit took, in seconds.
end_time = time.time()
total_time = end_time - start_time
print("Total time taken:", total_time)
Total time taken: 2825.572900056839
# Collect the cross-validation results into a DataFrame and list the
# available columns before building the summary table.
rfr_results = pd.DataFrame(rfr_grid.cv_results_)
rfr_results.columns
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_regr__max_features', 'param_regr__min_samples_leaf',
'param_regr__min_samples_split', 'params', 'split0_test_score',
'split1_test_score', 'split2_test_score', 'split3_test_score',
'split4_test_score', 'mean_test_score', 'std_test_score',
'rank_test_score', 'split0_train_score', 'split1_train_score',
'split2_train_score', 'split3_train_score', 'split4_train_score',
'mean_train_score', 'std_train_score'],
dtype='object')
# Grid-search summary: each hyperparameter combination with its mean/std CV
# train and test scores, best rank first, with the 'param_regr__' prefixes
# stripped from the column names for readability.
summary_cols = [
    'param_regr__max_features',
    'param_regr__min_samples_leaf',
    'param_regr__min_samples_split',
    'mean_test_score',
    'std_test_score',
    'mean_train_score',
    'std_train_score',
    'rank_test_score',
]
pretty_names = {
    'param_regr__max_features': 'max_features',
    'param_regr__min_samples_leaf': 'min_samples_leaf',
    'param_regr__min_samples_split': 'min_samples_split',
}
rfr_results[summary_cols].sort_values('rank_test_score').rename(columns=pretty_names)
| max_features | min_samples_leaf | min_samples_split | mean_test_score | std_test_score | mean_train_score | std_train_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|
| 0 | sqrt | 6 | 6 | -2049.962290 | 18.555652 | -1900.627397 | 3.708405 | 1 |
| 10 | log2 | 6 | 8 | -2050.120766 | 18.550933 | -1900.804744 | 3.795331 | 2 |
| 11 | log2 | 6 | 10 | -2050.274862 | 18.916334 | -1901.181807 | 3.744739 | 3 |
| 2 | sqrt | 6 | 10 | -2050.369348 | 19.335764 | -1901.250574 | 4.032631 | 4 |
| 9 | log2 | 6 | 6 | -2050.526482 | 18.630142 | -1900.678704 | 3.892057 | 5 |
| 1 | sqrt | 6 | 8 | -2050.756499 | 18.627939 | -1900.785580 | 4.050979 | 6 |
| 14 | log2 | 9 | 10 | -2073.774600 | 19.057211 | -1955.036839 | 4.622660 | 7 |
| 5 | sqrt | 9 | 10 | -2074.025782 | 18.989398 | -1954.942416 | 4.612270 | 8 |
| 13 | log2 | 9 | 8 | -2074.529057 | 17.833773 | -1955.506478 | 5.061760 | 9 |
| 3 | sqrt | 9 | 6 | -2074.615409 | 19.242223 | -1955.140206 | 4.355991 | 10 |
| 12 | log2 | 9 | 6 | -2075.014257 | 18.844282 | -1955.585639 | 4.358526 | 11 |
| 4 | sqrt | 9 | 8 | -2075.905300 | 19.305352 | -1955.790391 | 4.323940 | 12 |
| 6 | sqrt | 12 | 6 | -2096.235795 | 19.174995 | -1994.982510 | 4.474310 | 13 |
| 16 | log2 | 12 | 8 | -2096.736846 | 19.164042 | -1994.836868 | 4.795302 | 14 |
| 15 | log2 | 12 | 6 | -2097.129943 | 19.877170 | -1995.191400 | 3.658333 | 15 |
| 8 | sqrt | 12 | 10 | -2097.267247 | 19.295350 | -1995.122106 | 4.928546 | 16 |
| 17 | log2 | 12 | 10 | -2097.504455 | 19.544558 | -1995.301768 | 3.854381 | 17 |
| 7 | sqrt | 12 | 8 | -2097.897226 | 19.370740 | -1995.657547 | 4.596817 | 18 |
# The pipeline refit on the full training set with the best hyperparameter
# combination found by the grid search.
rfr_best_pipe = rfr_grid.best_estimator_
rfr_best_pipe
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr',
RandomForestRegressor(max_features='sqrt', min_samples_leaf=6,
min_samples_split=6))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr',
RandomForestRegressor(max_features='sqrt', min_samples_leaf=6,
min_samples_split=6))])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
RandomForestRegressor(max_features='sqrt', min_samples_leaf=6,
min_samples_split=6)# Model performance evaluation
# Model performance evaluation of the grid-searched Random Forest pipeline
# on the held-out test set. Predict once and reuse the array — the original
# recomputed rfr_best_pipe.predict(X_test) for every metric, running the
# whole preprocessing + forest pipeline four times.
rfr_best_pipe_pred = rfr_best_pipe.predict(X_test)
rfr_best_pipe_r2_score = r2_score(y_test, rfr_best_pipe_pred)
rfr_best_pipe_rmse = rmse(y_test, rfr_best_pipe_pred)
rfr_best_pipe_mse = mean_squared_error(y_test, rfr_best_pipe_pred)
rfr_best_pipe_mae = mean_absolute_error(y_test, rfr_best_pipe_pred)
# Print the evaluation metrics.
# Label corrected: the fitted pipeline uses SelectKBest(k=22), not k=25.
print('Grid-search best_pipe Random Forest Regressor model with Feature selection, k=22:')
print("r2 score: ", rfr_best_pipe_r2_score)
print('Root Mean Squared Error:', rfr_best_pipe_rmse)
print('Mean Squared Error:', rfr_best_pipe_mse)
print('Mean Absolute Error:', rfr_best_pipe_mae)
Grid-search best_pipe Random Forest Regressor model with Feature selection, k=25: r2 score: 0.9267501945981728 Root Mean Squared Error: 2033.3622359111484 Mean Squared Error: 4134561.982429585 Mean Absolute Error: 1384.2918195277766
rfr_best_pipe['regr'].feature_names_in_
array(['year_of_registration', 'standard_model', 'condition',
'year_of_registration^2', 'year_of_registration standard_model',
'year_of_registration condition', 'standard_colour standard_model',
'standard_colour condition', 'standard_make standard_model',
'standard_make condition', 'standard_make usage',
'standard_model^2', 'standard_model body_type',
'standard_model fuel_type',
'standard_model manufacturer_popularity',
'standard_model condition', 'standard_model usage',
'body_type condition', 'body_type usage', 'fuel_type condition',
'manufacturer_popularity condition', 'condition^2'], dtype=object)
rfr_best_pipe['regr'].feature_importances_
array([0.02469534, 0.02882418, 0.00545168, 0.02248653, 0.05767102,
0.02328537, 0.03367985, 0.00735674, 0.01903407, 0.11253282,
0.09082786, 0.04121723, 0.01970034, 0.01913071, 0.0224855 ,
0.2209161 , 0.15936871, 0.04689474, 0.010352 , 0.01415751,
0.01328135, 0.00665034])
# Tabulate the forest's feature importances next to their feature names,
# sorted with the most important feature first.
importances = rfr_best_pipe['regr'].feature_importances_
feat_names = rfr_best_pipe['regr'].feature_names_in_
feat_imp_df = pd.DataFrame(
    zip(importances, feat_names),
    columns=['importance', 'features'],
)
feat_imp_df.sort_values(by='importance', key=np.abs, ascending=False)
| importance | features | |
|---|---|---|
| 15 | 0.220916 | standard_model condition |
| 16 | 0.159369 | standard_model usage |
| 9 | 0.112533 | standard_make condition |
| 10 | 0.090828 | standard_make usage |
| 4 | 0.057671 | year_of_registration standard_model |
| 17 | 0.046895 | body_type condition |
| 11 | 0.041217 | standard_model^2 |
| 6 | 0.033680 | standard_colour standard_model |
| 1 | 0.028824 | standard_model |
| 0 | 0.024695 | year_of_registration |
| 5 | 0.023285 | year_of_registration condition |
| 3 | 0.022487 | year_of_registration^2 |
| 14 | 0.022485 | standard_model manufacturer_popularity |
| 12 | 0.019700 | standard_model body_type |
| 13 | 0.019131 | standard_model fuel_type |
| 8 | 0.019034 | standard_make standard_model |
| 19 | 0.014158 | fuel_type condition |
| 20 | 0.013281 | manufacturer_popularity condition |
| 18 | 0.010352 | body_type usage |
| 7 | 0.007357 | standard_colour condition |
| 21 | 0.006650 | condition^2 |
| 2 | 0.005452 | condition |
def plot_rf_feat_imp_barh(rf, feat_names, ax=None, top_feat_k=10, style_kws=None):
    """Plot the top-k feature importances of a fitted forest as horizontal bars.

    Parameters
    ----------
    rf : fitted estimator exposing ``feature_importances_`` (e.g. a RandomForestRegressor).
    feat_names : sequence of feature names aligned with ``rf.feature_importances_``.
    ax : matplotlib Axes to draw on; a new figure/axes is created when None.
    top_feat_k : number of most-important features to display.
    style_kws : extra keyword arguments forwarded to ``pandas.Series.plot.barh``.

    Returns
    -------
    The matplotlib Axes the bars were drawn on.
    """
    # Avoid the shared-mutable-default pitfall of the original style_kws={}.
    if style_kws is None:
        style_kws = {}
    if ax is None:
        fig, ax = plt.subplots()
    # BUG FIX: the original created/accepted `ax` but never passed it to
    # plot.barh, so an explicitly supplied axes was silently ignored.
    return pd.Series(
        rf.feature_importances_,
        index=feat_names
    ).sort_values().tail(top_feat_k).plot.barh(ax=ax, **style_kws)
# Horizontal bar chart of the 10 most important features of the tuned forest.
plot_rf_feat_imp_barh(
    rfr_best_pipe['regr'],
    rfr_best_pipe['regr'].feature_names_in_,
);
# Wrap a default HistGradientBoostingRegressor in the shared preprocessing
# pipeline, then fit it on the training split (the fitted pipeline repr is
# the cell's displayed value).
hgb_base = HistGradientBoostingRegressor()
hgb = create_regr_pipe(hgb_base)
hgb.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', HistGradientBoostingRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', HistGradientBoostingRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
HistGradientBoostingRegressor()
# Model performance evaluation of the plain HGB pipeline on the test set.
# Predict once and reuse — the original recomputed hgb.predict(X_test)
# for every metric, running the full pipeline four times.
hgb_pred = hgb.predict(X_test)
hgb_r2_score = r2_score(y_test, hgb_pred)
hgb_rmse = rmse(y_test, hgb_pred)
hgb_mse = mean_squared_error(y_test, hgb_pred)
hgb_mae = mean_absolute_error(y_test, hgb_pred)
# Print the evaluation metrics
print('Evaluation metrics of HistGradient Boosting Regressor model with all the preprocessed Features:')
print("r2 score: ", hgb_r2_score)
print('Root Mean Squared Error:', hgb_rmse)
print('Mean Squared Error:', hgb_mse)
print('Mean Absolute Error:', hgb_mae)
Evaluation metrics of HistGradient Boosting Regressor model with all the preprocessed Features: r2 score: 0.9171126554514961 Root Mean Squared Error: 2162.995565445511 Mean Squared Error: 4678549.816136947 Mean Absolute Error: 1499.707476089365
# Same HGB estimator, but inside the pipeline variant that adds a
# SelectKBest feature-selection step; fit on the training split.
featsel_base = HistGradientBoostingRegressor()
hgb_featsel = create_regr_pipe_featsel(featsel_base)
hgb_featsel.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', HistGradientBoostingRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('regr', HistGradientBoostingRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
HistGradientBoostingRegressor()
hgb_featsel['featsel'].get_feature_names_out()
array(['year_of_registration', 'standard_model', 'condition',
'year_of_registration^2', 'year_of_registration standard_model',
'year_of_registration condition', 'standard_colour standard_model',
'standard_colour condition', 'standard_make standard_model',
'standard_make condition', 'standard_make usage',
'standard_model^2', 'standard_model body_type',
'standard_model fuel_type',
'standard_model manufacturer_popularity',
'standard_model condition', 'standard_model usage',
'body_type condition', 'body_type usage', 'fuel_type condition',
'manufacturer_popularity condition', 'condition^2'], dtype=object)
# Model performance evaluation of the feature-selection HGB pipeline on the
# test set. Predict once and reuse — the original recomputed
# hgb_featsel.predict(X_test) for every metric (four pipeline runs).
hgb_featsel_pred = hgb_featsel.predict(X_test)
hgb_featsel_r2_score = r2_score(y_test, hgb_featsel_pred)
hgb_featsel_rmse = rmse(y_test, hgb_featsel_pred)
hgb_featsel_mse = mean_squared_error(y_test, hgb_featsel_pred)
hgb_featsel_mae = mean_absolute_error(y_test, hgb_featsel_pred)
# Print the evaluation metrics.
# Label corrected: the fitted pipeline uses SelectKBest(k=22), not k=25.
print('Evaluation metrics of HistGradient Boosting Regressor model with Feature selection, k=22:')
print("r2 score: ", hgb_featsel_r2_score)
print('Root Mean Squared Error:', hgb_featsel_rmse)
print('Mean Squared Error:', hgb_featsel_mse)
print('Mean Absolute Error:', hgb_featsel_mae)
Evaluation metrics of HistGradient Boosting Regressor model with Feature selection, k=25: r2 score: 0.9094270965501591 Root Mean Squared Error: 2261.05257465641 Mean Squared Error: 5112358.74536038 Mean Absolute Error: 1586.867414249759
# Third variant: SelectKBest followed by a PCA projection before the HGB
# estimator; fit on the training split.
pca_base = HistGradientBoostingRegressor()
hgb_pca = create_regr_pipe_pca(pca_base)
hgb_pca.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('pca', PCA(n_components=13)),
('regr', HistGradientBoostingRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('featsel',
SelectKBest(k=22,
score_func=<function f_regression at 0x7fc2c16e4af0>)),
('pca', PCA(n_components=13)),
('regr', HistGradientBoostingRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
PCA(n_components=13)
HistGradientBoostingRegressor()
hgb_pca['pca'].get_feature_names_out()
array(['pca0', 'pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7',
'pca8', 'pca9', 'pca10', 'pca11', 'pca12'], dtype=object)
# Model performance evaluation of the feature-selection + PCA HGB pipeline
# on the test set. Predict once and reuse — the original recomputed
# hgb_pca.predict(X_test) for every metric (four pipeline runs).
hgb_pca_pred = hgb_pca.predict(X_test)
hgb_pca_r2_score = r2_score(y_test, hgb_pca_pred)
hgb_pca_rmse = rmse(y_test, hgb_pca_pred)
hgb_pca_mse = mean_squared_error(y_test, hgb_pca_pred)
hgb_pca_mae = mean_absolute_error(y_test, hgb_pca_pred)
# Print the evaluation metrics.
# Label corrected: the fitted pipeline uses SelectKBest(k=22) and
# PCA(n_components=13), not k=25 / n_components=15 (also fixes "and and").
print('Evaluation metrics of HistGradient Boosting Regressor model with Feature selection, k=22 and PCA, n_components=13:')
print("r2 score: ", hgb_pca_r2_score)
print('Root Mean Squared Error:', hgb_pca_rmse)
print('Mean Squared Error:', hgb_pca_mse)
print('Mean Absolute Error:', hgb_pca_mae)
Evaluation metrics of HistGradient Boosting Regressor model with Feature selection, k=25 and and PCA, n_components=15: r2 score: 0.8888907627632436 Root Mean Squared Error: 2504.301415205747 Mean Squared Error: 6271525.578201507 Mean Absolute Error: 1755.8619267270412
# Side-by-side recap of the three HistGradientBoostingRegressor variants.
# Labels corrected to match the fitted pipelines above: SelectKBest used
# k=22 and PCA used n_components=13 (not k=25 / 15); "and and" typo fixed.
# the evaluation metrics for HistGradientBoostingRegressor with all the preprocessed Features
print('Evaluation metrics of HistGradient Boosting Regressor model with all the preprocessed Features:')
print("r2 score: ", hgb_r2_score)
print('Root Mean Squared Error:', hgb_rmse)
print('Mean Squared Error:', hgb_mse)
print('Mean Absolute Error:', hgb_mae)
print('---------------------------------------------------------------------------------------------------------------------')
# the evaluation metrics for HistGradientBoostingRegressor model with Feature selection
print('Evaluation metrics of HistGradient Boosting Regressor model with Feature selection, k=22:')
print("r2 score: ", hgb_featsel_r2_score)
print('Root Mean Squared Error:', hgb_featsel_rmse)
print('Mean Squared Error:', hgb_featsel_mse)
print('Mean Absolute Error:', hgb_featsel_mae)
print('---------------------------------------------------------------------------------------------------------------------')
# the evaluation metrics for HistGradientBoostingRegressor model with Feature selection + PCA
print('Evaluation metrics of HistGradient Boosting Regressor model with Feature selection, k=22 and PCA, n_components=13:')
print("r2 score: ", hgb_pca_r2_score)
print('Root Mean Squared Error:', hgb_pca_rmse)
print('Mean Squared Error:', hgb_pca_mse)
print('Mean Absolute Error:', hgb_pca_mae)
print()
Evaluation metrics of HistGradient Boosting Regressor model with all the preprocessed Features: r2 score: 0.9171126554514961 Root Mean Squared Error: 2162.995565445511 Mean Squared Error: 4678549.816136947 Mean Absolute Error: 1499.707476089365 --------------------------------------------------------------------------------------------------------------------- Evaluation metrics of HistGradient Boosting Regressor model with Feature selection, k=25: r2 score: 0.9094270965501591 Root Mean Squared Error: 2261.05257465641 Mean Squared Error: 5112358.74536038 Mean Absolute Error: 1586.867414249759 --------------------------------------------------------------------------------------------------------------------- Evaluation metrics of HistGradient Boosting Regressor model with Feature selection, k=25 and and PCA, n_components=15: r2 score: 0.8888907627632436 Root Mean Squared Error: 2504.301415205747 Mean Squared Error: 6271525.578201507 Mean Absolute Error: 1755.8619267270412
The results of the HistGradientBoostingRegressor models show that the model trained on all the preprocessed features achieved the highest r2 score of about 0.917 and the lowest Root Mean Squared Error (RMSE) of about 2163 pounds when predicting the price of cars. This means that the model with all the features was the best at capturing the underlying patterns in the data and making accurate predictions.
The model trained on the feature-selection subset of the preprocessed features (SelectKBest, k=22) achieved a slightly lower r2 score of about 0.909 and a higher RMSE of about 2261 pounds. This indicates that reducing the number of features may have resulted in the loss of some information that is useful for predicting the price of cars, although the impact was modest.
The model trained on the PCA-reduced subset of the preprocessed features (n_components=13) achieved a lower r2 score of about 0.889 and an RMSE of about 2504 pounds, which is worse than both the model with all the features and the model with feature selection. This suggests that the PCA transformation may not have preserved all the important structure in the data, leading to decreased performance in predicting the price of cars.
Overall, the HistGradientBoostingRegressor model trained on all the preprocessed features achieved the best performance in terms of both score and RMSE, indicating that it is the most effective model for predicting the price of cars.
# Hyper-parameter search space for the 'regr' (HistGradientBoostingRegressor)
# step of the pipeline: 2 x 2 x 2 = 8 candidate combinations.
param_grid = dict(
    regr__learning_rate=[0.1, 0.01],
    regr__max_iter=[50, 150],
    regr__max_depth=[1, 6],
)
param_grid
{'regr__learning_rate': [0.1, 0.01],
'regr__max_iter': [50, 150],
'regr__max_depth': [1, 6]}
# Exhaustive grid search over the HGB hyper-parameters (default 5-fold CV).
# Scoring is negative RMSE, so scores closer to zero are better;
# return_train_score=True lets us compare train vs validation error later.
hgb_grid = GridSearchCV(
    hgb, param_grid, return_train_score=True,
    scoring='neg_root_mean_squared_error'
)
import time  # NOTE(review): mid-notebook import; conventionally belongs at the top of the file
start_time = time.time()  # wall-clock start; elapsed time is printed in a later cell
hgb_grid.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncode...
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr',
HistGradientBoostingRegressor())]),
param_grid={'regr__learning_rate': [0.1, 0.01],
'regr__max_depth': [1, 6],
'regr__max_iter': [50, 150]},
return_train_score=True, scoring='neg_root_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncode...
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr',
HistGradientBoostingRegressor())]),
param_grid={'regr__learning_rate': [0.1, 0.01],
'regr__max_depth': [1, 6],
'regr__max_iter': [50, 150]},
return_train_score=True, scoring='neg_root_mean_squared_error')Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', HistGradientBoostingRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
HistGradientBoostingRegressor()
# Stop the wall-clock timer started before the grid search (start_time is
# defined in an earlier cell) and report the total elapsed seconds.
end_time = time.time()
total_time = end_time - start_time
print("Total time taken:", total_time)
Total time taken: 417.30329513549805
# Wrap the grid-search cross-validation results in a DataFrame so the
# per-candidate scores can be filtered and sorted, then list its columns.
hgb_results = pd.DataFrame(data=hgb_grid.cv_results_)
hgb_results.columns
Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
'param_regr__learning_rate', 'param_regr__max_depth',
'param_regr__max_iter', 'params', 'split0_test_score',
'split1_test_score', 'split2_test_score', 'split3_test_score',
'split4_test_score', 'mean_test_score', 'std_test_score',
'rank_test_score', 'split0_train_score', 'split1_train_score',
'split2_train_score', 'split3_train_score', 'split4_train_score',
'mean_train_score', 'std_train_score'],
dtype='object')
# Summarise the HistGradientBoosting grid search: keep only the tuned
# hyperparameters plus the CV scores, order the candidates by rank, and
# strip the 'param_regr__' prefix for readable column headers.
summary_columns = [
    'param_regr__learning_rate', 'param_regr__max_depth',
    'param_regr__max_iter', 'mean_test_score', 'std_test_score',
    'mean_train_score', 'std_train_score', 'rank_test_score',
]
short_names = {
    'param_regr__learning_rate': 'learning_rate',
    'param_regr__max_depth': 'max_depth',
    'param_regr__max_iter': 'max_iter',
}
hgb_results[summary_columns].sort_values('rank_test_score').rename(columns=short_names)
| learning_rate | max_depth | max_iter | mean_test_score | std_test_score | mean_train_score | std_train_score | rank_test_score | |
|---|---|---|---|---|---|---|---|---|
| 3 | 0.1 | 6 | 150 | -2129.501481 | 19.840412 | -2087.071148 | 3.739299 | 1 |
| 2 | 0.1 | 6 | 50 | -2406.945652 | 25.931056 | -2381.625184 | 4.022284 | 2 |
| 1 | 0.1 | 1 | 150 | -2919.246087 | 23.823296 | -2907.512184 | 4.301994 | 3 |
| 7 | 0.01 | 6 | 150 | -3216.447662 | 16.317134 | -3202.565758 | 3.020932 | 4 |
| 0 | 0.1 | 1 | 50 | -3246.453091 | 23.069263 | -3237.229380 | 5.877502 | 5 |
| 5 | 0.01 | 1 | 150 | -4506.438145 | 17.332734 | -4502.384001 | 4.060647 | 6 |
| 6 | 0.01 | 6 | 50 | -5156.965230 | 12.956562 | -5151.577253 | 0.897111 | 7 |
| 4 | 0.01 | 1 | 50 | -6000.265337 | 16.207455 | -5998.788252 | 3.805977 | 8 |
# Keep the pipeline refit (by GridSearchCV) on the whole training set with the
# best-ranked hyperparameters; displayed below as the cell's value.
hgb_best_pipe = hgb_grid.best_estimator_
hgb_best_pipe
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr',
HistGradientBoostingRegressor(max_depth=6, max_iter=150))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr',
HistGradientBoostingRegressor(max_depth=6, max_iter=150))])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
HistGradientBoostingRegressor(max_depth=6, max_iter=150)
# Model performance evaluation of the best HistGradientBoosting pipeline on the
# held-out test set. Predict ONCE and reuse the array: the original called
# `hgb_best_pipe.predict(X_test)` four times, quadrupling inference cost for
# identical output.
hgb_best_pipe_pred = hgb_best_pipe.predict(X_test)
hgb_best_pipe_r2_score = r2_score(y_test, hgb_best_pipe_pred)
hgb_best_pipe_rmse = rmse(y_test, hgb_best_pipe_pred)
hgb_best_pipe_mse = mean_squared_error(y_test, hgb_best_pipe_pred)
hgb_best_pipe_mae = mean_absolute_error(y_test, hgb_best_pipe_pred)
# Print the evaluation metrics
print('Grid-search best_pipe HistGradient Boosting Regressor model with all the preprocessed Features:')
print("r2 score: ", hgb_best_pipe_r2_score)
print('Root Mean Squared Error:', hgb_best_pipe_rmse)
print('Mean Squared Error:', hgb_best_pipe_mse)
print('Mean Absolute Error:', hgb_best_pipe_mae)
Grid-search best_pipe HistGradient Boosting Regressor model with all the preprocessed Features: r2 score: 0.9198402296448764 Root Mean Squared Error: 2127.109014082084 Mean Squared Error: 4524592.757789254 Mean Absolute Error: 1466.6708484424946
hgb_best_pipe['regr'].feature_names_in_
array(['mileage', 'year_of_registration', 'crossover_car_and_van_True',
'standard_colour', 'standard_make', 'standard_model', 'body_type',
'fuel_type', 'manufacturer_popularity', 'condition', 'usage',
'mileage^2', 'mileage year_of_registration',
'mileage crossover_car_and_van_True', 'mileage standard_colour',
'mileage standard_make', 'mileage standard_model',
'mileage body_type', 'mileage fuel_type',
'mileage manufacturer_popularity', 'mileage condition',
'mileage usage', 'year_of_registration^2',
'year_of_registration crossover_car_and_van_True',
'year_of_registration standard_colour',
'year_of_registration standard_make',
'year_of_registration standard_model',
'year_of_registration body_type', 'year_of_registration fuel_type',
'year_of_registration manufacturer_popularity',
'year_of_registration condition', 'year_of_registration usage',
'crossover_car_and_van_True^2',
'crossover_car_and_van_True standard_colour',
'crossover_car_and_van_True standard_make',
'crossover_car_and_van_True standard_model',
'crossover_car_and_van_True body_type',
'crossover_car_and_van_True fuel_type',
'crossover_car_and_van_True manufacturer_popularity',
'crossover_car_and_van_True condition',
'crossover_car_and_van_True usage', 'standard_colour^2',
'standard_colour standard_make', 'standard_colour standard_model',
'standard_colour body_type', 'standard_colour fuel_type',
'standard_colour manufacturer_popularity',
'standard_colour condition', 'standard_colour usage',
'standard_make^2', 'standard_make standard_model',
'standard_make body_type', 'standard_make fuel_type',
'standard_make manufacturer_popularity', 'standard_make condition',
'standard_make usage', 'standard_model^2',
'standard_model body_type', 'standard_model fuel_type',
'standard_model manufacturer_popularity',
'standard_model condition', 'standard_model usage', 'body_type^2',
'body_type fuel_type', 'body_type manufacturer_popularity',
'body_type condition', 'body_type usage', 'fuel_type^2',
'fuel_type manufacturer_popularity', 'fuel_type condition',
'fuel_type usage', 'manufacturer_popularity^2',
'manufacturer_popularity condition',
'manufacturer_popularity usage', 'condition^2', 'condition usage',
'usage^2'], dtype=object)
Dummy Regressor baseline model
# Baseline: place a DummyRegressor inside the same preprocessing pipeline so
# every real model can be compared against a trivial predictor.
baseline_estimator = DummyRegressor()
dummy = create_regr_pipe(baseline_estimator)
dummy.fit(X_train, y_train)
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', DummyRegressor())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
hand...
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr', DummyRegressor())])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
DummyRegressor()
# Baseline performance on the test set — only RMSE is reported, since the
# dummy model's r2 is 0 by construction.
dummy_pred = dummy.predict(X_test)
dummy_rmse = rmse(y_test, dummy_pred)
# Print the evaluation metrics
print('Dummy Regressor model with all the preprocessed Features:')
print('Root Mean Squared Error:', dummy_rmse)
Dummy Regressor model with all the preprocessed Features: Root Mean Squared Error: 7512.967598655133
# Consolidated report: baseline RMSE first, then the full metric set for each
# tuned model, driven by a single loop so the formatting stays uniform.
separator = '-----------------------------------------------------------------------------------------------------'
model_reports = [
    ('Grid-search best_pipe Linear regression(Ridge) model with all the preprocessed Features:',
     lr_best_pipe_r2_score, lr_best_pipe_rmse, lr_best_pipe_mse, lr_best_pipe_mae),
    ('Grid-search best_pipe Random Forest Regressor model with selected Features:',
     rfr_best_pipe_r2_score, rfr_best_pipe_rmse, rfr_best_pipe_mse, rfr_best_pipe_mae),
    ('Grid-search best_pipe HistGradient Boosting Regressor model with all the preprocessed Features:',
     hgb_best_pipe_r2_score, hgb_best_pipe_rmse, hgb_best_pipe_mse, hgb_best_pipe_mae),
]
print('Dummy Regressor model with all the preprocessed Features:')
print('Root Mean Squared Error:', dummy_rmse)
for title, r2_val, rmse_val, mse_val, mae_val in model_reports:
    print(separator)
    print(title)
    print("r2 score: ", r2_val)
    print('Root Mean Squared Error:', rmse_val)
    print('Mean Squared Error:', mse_val)
    print('Mean Absolute Error:', mae_val)
Dummy Regressor model with all the preprocessed Features: Root Mean Squared Error: 7512.967598655133 ----------------------------------------------------------------------------------------------------- Grid-search best_pipe Linear regression(Ridge) model with all the preprocessed Features: r2 score: 0.8480511004034516 Root Mean Squared Error: 2928.6016012153827 Mean Squared Error: 8576707.338641303 Mean Absolute Error: 2051.5179302551464 ----------------------------------------------------------------------------------------------------- Grid-search best_pipe Random Forest Regressor model with selected Features: r2 score: 0.9267501945981728 Root Mean Squared Error: 2033.3622359111484 Mean Squared Error: 4134561.982429585 Mean Absolute Error: 1384.2918195277766 ----------------------------------------------------------------------------------------------------- Grid-search best_pipe HistGradient Boosting Regressor model with all the preprocessed Features: r2 score: 0.9198402296448764 Root Mean Squared Error: 2127.109014082084 Mean Squared Error: 4524592.757789254 Mean Absolute Error: 1466.6708484424946
Build an ensemble from the best-performing models/configurations.
# Refit each best pipeline on the training data, then combine them into a
# VotingRegressor that averages the three models' predictions.
ensembled_1 = [lr_best_pipe, rfr_best_pipe, hgb_best_pipe]
for estimator in ensembled_1:
    estimator.fit(X_train, y_train)
ensemble_1 = VotingRegressor(list(zip(["lr", "rf", "hbg"], ensembled_1)))
ensemble_1.fit(X_train, y_train)
VotingRegressor(estimators=[('lr',
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
O...
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr',
HistGradientBoostingRegressor(max_depth=6,
max_iter=150))]))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. VotingRegressor(estimators=[('lr',
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
O...
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr',
HistGradientBoostingRegressor(max_depth=6,
max_iter=150))]))])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
Ridge(alpha=0.0001)
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
RandomForestRegressor(max_features='sqrt', min_samples_leaf=6,
min_samples_split=6)Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
HistGradientBoostingRegressor(max_depth=6, max_iter=150)
# Model performance evaluation of the voting ensemble on the test set.
# Predict ONCE and reuse the array: the original re-ran `ensemble_1.predict`
# (which itself runs all three member models) four times for identical output.
ens_1_pred = ensemble_1.predict(X_test)
ens_1_r2_score = r2_score(y_test, ens_1_pred)
ens_1_rmse = rmse(y_test, ens_1_pred)
ens_1_mse = mean_squared_error(y_test, ens_1_pred)
ens_1_mae = mean_absolute_error(y_test, ens_1_pred)
# Print the evaluation metrics
print('Evaluation metrics of Ensemble using all the best regression model:')
print("r2 score: ", ens_1_r2_score)
print('Root Mean Squared Error:', ens_1_rmse)
print('Mean Squared Error:', ens_1_mse)
print('Mean Absolute Error:', ens_1_mae)
Evaluation metrics of Ensemble using all the best regression model: r2 score: 0.916496279971848 Root Mean Squared Error: 2171.0230155763215 Mean Squared Error: 4713340.934162105 Mean Absolute Error: 1498.9045533956994
# Cross-validate the three individual models plus the ensemble on the training
# set; flip the sign of sklearn's negated RMSE so larger printed values mean
# worse fits, and show the fold-to-fold spread.
all_regr_1 = ensembled_1 + [ensemble_1]
for regressor in all_regr_1:
    fold_scores = cross_val_score(regressor, X_train, y_train, cv=5,
                                  scoring='neg_root_mean_squared_error')
    print(-fold_scores.mean(), fold_scores.std())
2940.8493374835293 21.710158160080447 2050.8184550823607 18.77372496404439 2131.7099792378726 20.89493035941793 2183.4531322456432 23.185409663380604
# Visual comparison of the first 30 test-set predictions from each model
# against the true prices, one marker style per model.
xt = X_test.head(30)
pred1 = lr_best_pipe.predict(xt)
pred2 = rfr_best_pipe.predict(xt)
pred3 = hgb_best_pipe.predict(xt)
pred4 = ensemble_1.predict(xt)
plt.figure(figsize=(10, 6))
plot_specs = [
    (pred1, "gd", "LR", {}),
    (pred2, "b^", "RF", {}),
    (pred3, "ys", "HGB", {}),
    (pred4, "r*", "Ensemble", {"ms": 10}),  # ensemble gets a larger star marker
]
for values, marker, label, extra_kwargs in plot_specs:
    plt.plot(values, marker, alpha=0.5, label=label, **extra_kwargs)
plt.plot(y_test.to_numpy()[:30], "kx", alpha=0.5, label="True Data")
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.ylabel("predicted")
plt.xlabel("test samples")
plt.legend(loc="best")
plt.title("Regressor predictions and their average")
plt.show()
y_test
394603 8499
21953 8995
298625 1495
393545 6990
64172 5500
...
283918 1799
147403 14590
192116 15995
260837 11999
56775 22495
Name: price, Length: 83026, dtype: int64
pred1
array([ 9595.87644772, 6694.86894016, 2568.62738305, 9300.59230143,
4345.74798703, 10448.97372998, 8275.2039521 , 20946.5541587 ,
19224.38859444, 17039.23095179, 2949.64856691, 13435.97821469,
15313.89762734, 1821.36420027, 5362.99210056, 14687.49365451,
20043.85733162, 5894.21111351, 7370.05124752, 12786.19617837,
20058.30758017, 17230.24786671, 10302.14833548, 8460.49600769,
15086.40643211, 10608.33149661, 6458.13782295, 15134.38936476,
8164.93045539, 13681.80602701])
pred2
array([ 8964.63570327, 8790.45802693, 2263.14465056, 9203.10227574,
5077.39007556, 9035.85371654, 9174.65863815, 23650.0269488 ,
19410.9043044 , 19364.66135608, 3055.77798893, 13588.6476387 ,
13255.90131383, 1727.2634342 , 5638.93715852, 15611.29992085,
24751.16816757, 6359.07887005, 6851.39809668, 11004.40591999,
22908.7720577 , 17592.69052297, 12472.50905293, 10699.12797405,
14013.55094714, 10772.62358603, 6889.07390394, 11558.72496876,
7820.17319432, 17596.5219926 ])
pred3
array([ 8651.14274852, 8601.44286371, 2011.84164334, 9149.27308481,
4767.23303629, 9012.19280594, 9025.70776825, 24143.87482109,
21224.087829 , 17754.93964482, 3372.70843535, 13559.78073051,
15230.5963485 , 1947.06947168, 4998.31947323, 14985.56655319,
24351.43271795, 6323.84837746, 6709.37632125, 11872.76668793,
22112.67732738, 17575.79211348, 10962.1058767 , 9563.02819392,
13586.19384097, 10027.10216176, 6437.40707507, 13151.99173272,
8208.3744873 , 17672.7319512 ])
y_test.dtypes
dtype('int64')
y_test.head()
394603 8499 21953 8995 298625 1495 393545 6990 64172 5500 Name: price, dtype: int64
# Tabulate each model's predictions for the first 30 test rows next to
# the true prices, for a side-by-side comparison.
instance = X_test.iloc[:30]
model_columns = {
    'Linear Regression': lr_best_pipe,
    'Random Forest': rfr_best_pipe,
    'Gradient Boosting': hgb_best_pipe,
    'Ensemble': ensemble_1,
}
predictions = pd.DataFrame(
    {name: model.predict(instance) for name, model in model_columns.items()}
)
predictions['Actual Values'] = y_test.to_numpy()[:30]
predictions
| Linear Regression | Random Forest | Gradient Boosting | Ensemble | Actual Values | |
|---|---|---|---|---|---|
| 0 | 9595.876448 | 8964.635703 | 8651.142749 | 9291.565377 | 8499 |
| 1 | 6694.868940 | 8790.458027 | 8601.442864 | 8133.398688 | 8995 |
| 2 | 2568.627383 | 2263.144651 | 2011.841643 | 2264.769085 | 1495 |
| 3 | 9300.592301 | 9203.102276 | 9149.273085 | 9144.965485 | 6990 |
| 4 | 4345.747987 | 5077.390076 | 4767.233036 | 4796.055852 | 5500 |
| 5 | 10448.973730 | 9035.853717 | 9012.192806 | 9587.348297 | 10000 |
| 6 | 8275.203952 | 9174.658638 | 9025.707768 | 8806.618878 | 8495 |
| 7 | 20946.554159 | 23650.026949 | 24143.874821 | 22880.715427 | 26980 |
| 8 | 19224.388594 | 19410.904304 | 21224.087829 | 20025.758605 | 19989 |
| 9 | 17039.230952 | 19364.661356 | 17754.939645 | 18017.562672 | 14990 |
| 10 | 2949.648567 | 3055.777989 | 3372.708435 | 3174.350700 | 2495 |
| 11 | 13435.978215 | 13588.647639 | 13559.780731 | 13573.551430 | 13200 |
| 12 | 15313.897627 | 13255.901314 | 15230.596349 | 14627.418146 | 14999 |
| 13 | 1821.364200 | 1727.263434 | 1947.069472 | 1862.721793 | 1740 |
| 14 | 5362.992101 | 5638.937159 | 4998.319473 | 5237.616322 | 4696 |
| 15 | 14687.493655 | 15611.299921 | 14985.566553 | 15072.667977 | 12990 |
| 16 | 20043.857332 | 24751.168168 | 24351.432718 | 23018.123148 | 22450 |
| 17 | 5894.211114 | 6359.078870 | 6323.848377 | 6202.078504 | 5195 |
| 18 | 7370.051248 | 6851.398097 | 6709.376321 | 6972.655900 | 5997 |
| 19 | 12786.196178 | 11004.405920 | 11872.766688 | 12037.297796 | 14391 |
| 20 | 20058.307580 | 22908.772058 | 22112.677327 | 21883.532158 | 22300 |
| 21 | 17230.247867 | 17592.690523 | 17575.792113 | 17648.892596 | 15950 |
| 22 | 10302.148335 | 12472.509053 | 10962.105877 | 11318.993793 | 8999 |
| 23 | 8460.496008 | 10699.127974 | 9563.028194 | 9446.419708 | 9350 |
| 24 | 15086.406432 | 14013.550947 | 13586.193841 | 14203.992529 | 12991 |
| 25 | 10608.331497 | 10772.623586 | 10027.102162 | 10621.520834 | 9400 |
| 26 | 6458.137823 | 6889.073904 | 6437.407075 | 6596.995932 | 5995 |
| 27 | 15134.389365 | 11558.724969 | 13151.991733 | 13224.071161 | 10450 |
| 28 | 8164.930455 | 7820.173194 | 8208.374487 | 8026.113500 | 8000 |
| 29 | 13681.806027 | 17596.521993 | 17672.731951 | 16329.088997 | 18490 |
# Second ensemble: keep only the two strongest pipelines (Random Forest and
# HistGradientBoosting); the linear model is deliberately left out.
ensembled_2 = [ rfr_best_pipe, hgb_best_pipe ]
# NOTE(review): VotingRegressor.fit refits cloned copies of its estimators,
# so this loop mainly refreshes the standalone pipelines that are also used
# for individual predictions later — confirm that is the intent.
for est in ensembled_2:
    est.fit(X_train, y_train)
# Equal-weight averaging of the two models' predictions.
ensemble_2 = VotingRegressor(
    [
        #("lr", lr_best_pipe)
        ("rf", rfr_best_pipe),
        ("hbg", hgb_best_pipe)
    ]
)
ensemble_2.fit(X_train, y_train)
VotingRegressor(estimators=[('rf',
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
O...
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr',
HistGradientBoostingRegressor(max_depth=6,
max_iter=150))]))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. VotingRegressor(estimators=[('rf',
Pipeline(steps=[('pp',
Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
O...
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])),
('regr',
HistGradientBoostingRegressor(max_depth=6,
max_iter=150))]))])Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
SelectKBest(k=22, score_func=<function f_regression at 0x7fc2c16e4af0>)
RandomForestRegressor(max_features='sqrt', min_samples_leaf=6,
min_samples_split=6)Pipeline(steps=[('preprocessor',
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sp...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler',
StandardScaler())]))]))])Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer())]),
['mileage',
'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossove...
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour',
'standard_make',
'standard_model',
'body_type', 'fuel_type',
'manufacturer_popularity',
'condition', 'usage'])],
verbose_feature_names_out=False)),
('poly_int',
Pipeline(steps=[('poly_int',
PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())]))])ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer())]),
['mileage', 'year_of_registration']),
('cat_1',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('ohe',
OneHotEncoder(drop='if_binary',
handle_unknown='ignore',
sparse_output=False))]),
['crossover_car_and_van']),
('cat_2',
Pipeli...
SimpleImputer(strategy='most_frequent')),
('targetencoder',
TargetEncoder(cols=['standard_colour',
'standard_make',
'standard_model',
'body_type',
'fuel_type',
'manufacturer_popularity',
'condition',
'usage']))]),
['standard_colour', 'standard_make',
'standard_model', 'body_type', 'fuel_type',
'manufacturer_popularity', 'condition',
'usage'])],
verbose_feature_names_out=False)['mileage', 'year_of_registration']
SimpleImputer()
['crossover_car_and_van']
SimpleImputer(strategy='most_frequent')
OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
['standard_colour', 'standard_make', 'standard_model', 'body_type', 'fuel_type', 'manufacturer_popularity', 'condition', 'usage']
SimpleImputer(strategy='most_frequent')
TargetEncoder(cols=['standard_colour', 'standard_make', 'standard_model',
'body_type', 'fuel_type', 'manufacturer_popularity',
'condition', 'usage'])[]
passthrough
Pipeline(steps=[('poly_int', PolynomialFeatures(include_bias=False)),
('scaler', StandardScaler())])PolynomialFeatures(include_bias=False)
StandardScaler()
HistGradientBoostingRegressor(max_depth=6, max_iter=150)
# Model performance evaluation for the RF + HGB voting ensemble.
# Predict once and reuse the result: each call to ensemble_2.predict(X_test)
# re-runs the full preprocessing and both base models over all 83k test rows,
# so the original four separate calls did the same expensive work four times.
ens_2_pred = ensemble_2.predict(X_test)
ens_2_r2_score = r2_score(y_test, ens_2_pred)
ens_2_rmse = rmse(y_test, ens_2_pred)
ens_2_mse = mean_squared_error(y_test, ens_2_pred)
ens_2_mae = mean_absolute_error(y_test, ens_2_pred)
# Print the evaluation metrics (heading corrected: this ensemble combines
# only the best RF and HGB pipelines, not all three models).
print('Evaluation metrics of Ensemble of the best RF and HGB regression models:')
print("r2 score: ", ens_2_r2_score)
print('Root Mean Squared Error:', ens_2_rmse)
print('Mean Squared Error:', ens_2_mse)
print('Mean Absolute Error:', ens_2_mae)
Evaluation metrics of Ensemble using all the best regression model: r2 score: 0.9283657892374405 Root Mean Squared Error: 2010.8133321010928 Mean Squared Error: 4043370.2565554995 Mean Absolute Error: 1368.974742609072
# Quick 5-fold CV RMSE check for each base pipeline and the voting ensemble.
all_regr_2 = ensembled_2 + [ensemble_2]
for model in all_regr_2:
    # Negate the sklearn "neg_" scores back to positive RMSE values.
    cv_rmse = -cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    print(cv_rmse.mean(), cv_rmse.std())
2050.2515837990604 18.953650987025153 2130.943596376109 20.4898951629468 2023.6010866992474 20.851126124437265
# Repeat the 30-sample prediction comparison for the two-model ensemble.
xt = X_test.head(30)
pred5 = rfr_best_pipe.predict(xt)
pred6 = hgb_best_pipe.predict(xt)
pred7 = ensemble_2.predict(xt)

plt.figure(figsize=(10, 6))
# (series, marker format, extra plot kwargs) — plotted in legend order.
for values, fmt, extra in [
    (pred5, "b^", {"label": "RF"}),
    (pred6, "ys", {"label": "HGB"}),
    (pred7, "r*", {"ms": 10, "label": "Ensemble"}),
    (y_test.to_numpy()[:30], "kx", {"label": "True Data"}),
]:
    plt.plot(values, fmt, alpha=0.5, **extra)
# Sample index on the x-axis carries no meaning, so hide the ticks.
plt.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
plt.ylabel("predicted")
plt.xlabel("test samples")
plt.legend(loc="best")
plt.title("Regressor predictions and their average")
plt.show()
y_test.head()
394603 8499 21953 8995 298625 1495 393545 6990 64172 5500 Name: price, dtype: int64
pred5
array([ 9261.64522193, 8768.22966936, 2284.54621613, 9096.91112615,
5106.59648427, 8980.04043171, 9180.72237138, 23520.57740125,
19456.74369753, 19353.86426216, 3104.63928174, 13606.77275591,
13200.59222012, 1775.5377516 , 5632.56196034, 15535.40498506,
24756.90037814, 6383.76617897, 6842.53513804, 11012.22222936,
22894.05113014, 17873.15894367, 12729.5528335 , 10799.27396316,
13874.42065518, 10866.44219391, 6766.26083861, 11586.80111114,
7764.21322623, 17610.95569626])
pred6
array([ 8716.81165214, 8626.98846673, 2068.69304518, 9065.83014051,
4672.5749111 , 9282.01690586, 9143.55027799, 24477.53674593,
21134.59744743, 17756.07084133, 3365.33275328, 13857.33946067,
15587.78562361, 1745.66467127, 4863.41294572, 15007.70940414,
23934.36962434, 6300.76285311, 6669.60173996, 12092.53486177,
22677.41487288, 18686.86163311, 10915.90396421, 8970.93990176,
13634.47583549, 10406.89798513, 6416.38093442, 13123.61697632,
8212.62041538, 17675.17994091])
pred7
array([ 8994.9705064 , 8684.46754345, 2114.90355601, 9133.91763727,
4909.28738189, 9144.7610055 , 9083.84471971, 24160.1683277 ,
20270.46464596, 18446.1461364 , 3256.92340347, 13788.5223736 ,
14314.4677586 , 1722.512481 , 5333.375982 , 15141.92229463,
24640.48583416, 6358.65126569, 6780.33632087, 11369.58128472,
22409.04455486, 17688.81972858, 11672.94760417, 10129.38154459,
13882.48956005, 10567.88897206, 6792.18392341, 12433.72332441,
8045.38564673, 17764.03144236])
y_test.dtypes
dtype('int64')
y_test.head()
394603 8499 21953 8995 298625 1495 393545 6990 64172 5500 Name: price, dtype: int64
# Side-by-side table: the two base models and their ensemble vs true prices.
instance = X_test.iloc[:30]
model_columns = {
    'Random Forest': rfr_best_pipe,
    'Gradient Boosting': hgb_best_pipe,
    'Ensemble': ensemble_2,
}
predictions = pd.DataFrame(
    {name: model.predict(instance) for name, model in model_columns.items()}
)
predictions['Actual Values'] = y_test.to_numpy()[:30]
predictions
| Random Forest | Gradient Boosting | Ensemble | Actual Values | |
|---|---|---|---|---|
| 0 | 9261.645222 | 8716.811652 | 8994.970506 | 8499 |
| 1 | 8768.229669 | 8626.988467 | 8684.467543 | 8995 |
| 2 | 2284.546216 | 2068.693045 | 2114.903556 | 1495 |
| 3 | 9096.911126 | 9065.830141 | 9133.917637 | 6990 |
| 4 | 5106.596484 | 4672.574911 | 4909.287382 | 5500 |
| 5 | 8980.040432 | 9282.016906 | 9144.761005 | 10000 |
| 6 | 9180.722371 | 9143.550278 | 9083.844720 | 8495 |
| 7 | 23520.577401 | 24477.536746 | 24160.168328 | 26980 |
| 8 | 19456.743698 | 21134.597447 | 20270.464646 | 19989 |
| 9 | 19353.864262 | 17756.070841 | 18446.146136 | 14990 |
| 10 | 3104.639282 | 3365.332753 | 3256.923403 | 2495 |
| 11 | 13606.772756 | 13857.339461 | 13788.522374 | 13200 |
| 12 | 13200.592220 | 15587.785624 | 14314.467759 | 14999 |
| 13 | 1775.537752 | 1745.664671 | 1722.512481 | 1740 |
| 14 | 5632.561960 | 4863.412946 | 5333.375982 | 4696 |
| 15 | 15535.404985 | 15007.709404 | 15141.922295 | 12990 |
| 16 | 24756.900378 | 23934.369624 | 24640.485834 | 22450 |
| 17 | 6383.766179 | 6300.762853 | 6358.651266 | 5195 |
| 18 | 6842.535138 | 6669.601740 | 6780.336321 | 5997 |
| 19 | 11012.222229 | 12092.534862 | 11369.581285 | 14391 |
| 20 | 22894.051130 | 22677.414873 | 22409.044555 | 22300 |
| 21 | 17873.158944 | 18686.861633 | 17688.819729 | 15950 |
| 22 | 12729.552833 | 10915.903964 | 11672.947604 | 8999 |
| 23 | 10799.273963 | 8970.939902 | 10129.381545 | 9350 |
| 24 | 13874.420655 | 13634.475835 | 13882.489560 | 12991 |
| 25 | 10866.442194 | 10406.897985 | 10567.888972 | 9400 |
| 26 | 6766.260839 | 6416.380934 | 6792.183923 | 5995 |
| 27 | 11586.801111 | 13123.616976 | 12433.723324 | 10450 |
| 28 | 7764.213226 | 8212.620415 | 8045.385647 | 8000 |
| 29 | 17610.955696 | 17675.179941 | 17764.031442 | 18490 |
In this section I evaluate the selected model(s) with cross-validation using popular score and loss metrics, analyse the true-vs-predicted plots, and discuss insights based on feature importance and the model output space.
# 10-fold cross-validated RMSE for the baseline Ridge pipeline; scores are
# negated from sklearn's 'neg_root_mean_squared_error' back to positive RMSE.
lr_cv_scores = cross_val_score(
    lr, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error'
)*-1
# Heading corrected: "rigression" -> "regression".
print("Cross validation mean and standard deviation scores for linear regression model(Ridge):")
print("Cross validation scores mean:", lr_cv_scores.mean())
print("Cross validation scores Standard deviation:", lr_cv_scores.std())
Cross validation mean and standard deviation scores for linear rigression model(Ridge): Cross validation scores mean: 2972.834437060577 Cross validation scores Standard deviation: 28.059082907206303
# 10-fold cross-validated RMSE for the Random Forest pipeline with
# SelectKBest feature selection (k=22); negate the "neg_" scores.
rfr_cv_scores = -cross_val_score(
    rfr_featsel, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error'
)
print("Cross validation mean and standard deviation scores for Random Forest Regressor model with Feature selection, k=22:")
print("Cross validation scores mean:", rfr_cv_scores.mean())
print("Cross validation scores Standard deviation:", rfr_cv_scores.std())
Cross validation mean and standard deviation scores for Random Forest Regressor model with Feature selection, k=22: Cross validation scores mean: 2056.074311032269 Cross validation scores Standard deviation: 18.673378623707407
# 10-fold cross-validated RMSE for the HistGradientBoosting pipeline;
# negate the "neg_" scores back to positive RMSE values.
hgb_cv_scores = -cross_val_score(
    hgb, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error'
)
print("Cross validation mean and standard deviation scores for HistGradient Boosting Regressor:")
print("Cross validation scores mean:", hgb_cv_scores.mean())
print("Cross validation scores Standard deviation:", hgb_cv_scores.std())
Cross validation mean and standard deviation scores for HistGradient Boosting Regressor: Cross validation scores mean: 2169.877685657632 Cross validation scores Standard deviation: 22.836271018669795
# 10-fold cross-validated RMSE for the grid-search-tuned Ridge pipeline;
# scores are negated from 'neg_root_mean_squared_error' to positive RMSE.
lr_grid_cv_scores = cross_val_score(
    lr_best_pipe, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error'
)*-1
# Heading corrected: "rigression" -> "regression".
print("Cross validation mean and standard deviation scores for linear regression model(Ridge) gridsearch result best_pipe:")
print("Cross validation scores mean:", lr_grid_cv_scores.mean())
print("Cross validation scores Standard deviation:", lr_grid_cv_scores.std())
Cross validation mean and standard deviation scores for linear rigression model(Ridge) gridsearch result best_pipe: Cross validation scores mean: 2936.669703015738 Cross validation scores Standard deviation: 28.173402779459753
# 10-fold cross-validated RMSE for the grid-search-tuned RF pipeline
# (with SelectKBest, k=22); negate the "neg_" scores.
rfr_grid_cv_scores = -cross_val_score(
    rfr_best_pipe, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error'
)
print("Cross validation mean and standard deviation scores for Random Forest Regressor model with Feature selection, k=22 gridsearch result best_pipe:")
print("Cross validation scores mean:", rfr_grid_cv_scores.mean())
print("Cross validation scores Standard deviation:", rfr_grid_cv_scores.std())
Cross validation mean and standard deviation scores for Random Forest Regressor model with Feature selection, k=22 gridsearch result best_pipe: Cross validation scores mean: 2038.7102796533375 Cross validation scores Standard deviation: 24.628017987846338
# 10-fold cross-validated RMSE for the grid-search-tuned HGB pipeline;
# negate the "neg_" scores back to positive RMSE values.
hgb_grid_cv_scores = -cross_val_score(
    hgb_best_pipe, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error'
)
print("Cross validation mean and standard deviation scores for HistGradient Boosting Regressor gridsearch result best_pipe:")
print("Cross validation scores mean:", hgb_grid_cv_scores.mean())
print("Cross validation scores Standard deviation:", hgb_grid_cv_scores.std())
Cross validation mean and standard deviation scores for HistGradient Boosting Regressor gridsearch result best_pipe: Cross validation scores mean: 2129.496257282318 Cross validation scores Standard deviation: 21.56285339008721
# Cross-validate every base pipeline and the 3-model voting ensemble,
# reporting mean and standard deviation of the 10-fold RMSE.
all_regr_1 = ensembled_1 + [ ensemble_1 ]
for idx, est in enumerate(all_regr_1):
    ens_cv_scores = cross_val_score(est, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    mean_score = -ens_cv_scores.mean()
    std_score = ens_cv_scores.std()
    # Base pipelines expose their final estimator under the 'regr' step;
    # the VotingRegressor appended at the end does not, so label it directly.
    # Use len(ensembled_1) instead of a hard-coded 3 so the check stays
    # correct if the list of base pipelines ever changes.
    if idx < len(ensembled_1):
        est_name = est.named_steps['regr'].__class__.__name__
    else:
        est_name = "ensemble_1"
    print(f"Model: {est_name}")
    print(f"Mean Score: {mean_score}")
    print(f"Standard Deviation: {std_score}")
    print('---------------------------------------------------')
Model: Ridge Mean Score: 2936.669703015738 Standard Deviation: 28.173402779459753 --------------------------------------------------- Model: RandomForestRegressor Mean Score: 2038.941406697786 Standard Deviation: 23.748899390526514 --------------------------------------------------- Model: HistGradientBoostingRegressor Mean Score: 2126.6647146970304 Standard Deviation: 24.85011806713759 --------------------------------------------------- Model: ensemble_1 Mean Score: 2176.559754153592 Standard Deviation: 27.288511320646187 ---------------------------------------------------
# Cross-validate the two base pipelines and the 2-model voting ensemble,
# reporting mean and standard deviation of the 10-fold RMSE.
all_regr_2 = ensembled_2 + [ ensemble_2 ]
for idx, est in enumerate(all_regr_2):
    ens_cv_scores2 = cross_val_score(est, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error')
    mean_score = -ens_cv_scores2.mean()
    std_score = ens_cv_scores2.std()
    # Base pipelines expose their final estimator under the 'regr' step;
    # the VotingRegressor appended at the end does not, so label it directly.
    # Use len(ensembled_2) instead of a hard-coded 2 so the check stays
    # correct if the list of base pipelines ever changes.
    if idx < len(ensembled_2):
        est_name = est.named_steps['regr'].__class__.__name__
    else:
        est_name = "ensemble_2"
    print(f"Model: {est_name}")
    print(f"Mean Score: {mean_score}")
    print(f"Standard Deviation: {std_score}")
    print('---------------------------------------------------')
Model: RandomForestRegressor Mean Score: 2038.8227906208479 Standard Deviation: 25.712296623855448 --------------------------------------------------- Model: HistGradientBoostingRegressor Mean Score: 2124.941665448982 Standard Deviation: 25.060687578927663 --------------------------------------------------- Model: ensemble_2 Mean Score: 2013.398754200986 Standard Deviation: 25.044121260805998 ---------------------------------------------------
def plot_true_vs_predicted(
    est,
    X_train, y_train,
    X_test, y_test,
    ax=None,
    train_style_kws=None,
    test_style_kws=None
):
    """Scatter true vs. predicted targets for the train and test splits.

    Parameters
    ----------
    est : fitted estimator exposing a ``predict`` method.
    X_train, y_train : training features and true targets.
    X_test, y_test : test features and true targets.
    ax : matplotlib Axes to draw on; a fresh figure/axes is created if None.
    train_style_kws, test_style_kws : optional dicts of extra keyword
        arguments forwarded to ``ax.plot`` for the respective split.

    Returns
    -------
    The matplotlib Axes the points were drawn on.
    """
    # Fix for the mutable-default-argument pitfall: the original used `{}`
    # defaults, which are shared across all calls; use None sentinels and
    # create a fresh dict per call instead. Passing `{}` still works.
    if train_style_kws is None:
        train_style_kws = {}
    if test_style_kws is None:
        test_style_kws = {}
    if ax is None:
        fig, ax = plt.subplots()
    y_pred_train = est.predict(X_train)
    y_pred_test = est.predict(X_test)
    ax.plot(y_train, y_pred_train, '.', label='train', **train_style_kws)
    ax.plot(y_test, y_pred_test, '.', label='test', **test_style_kws)
    ax.set_xlabel('True Target')
    ax.set_ylabel('Predicted Target')
    # the diagonal line for the idealised space of perfect predictions
    ax.plot(
        [0, 1], [0, 1], transform=ax.transAxes,
        color='green', linestyle=':', alpha=0.7
    )
    ax.legend()
    return ax
# True-vs-predicted diagnostic plots for the tuned RF pipeline and the
# RF+HGB voting ensemble, one figure per model.
sns.set(style="whitegrid")
for model in (rfr_best_pipe, ensemble_2):
    fig, ax = plt.subplots(figsize=(7, 7), constrained_layout=True)
    plot_true_vs_predicted(
        model,
        X_train, y_train,
        X_test, y_test,
        ax=ax,
    )
I employed the SHAP (SHapley Additive exPlanations) approach to interpret the model's predictions and understand the impact of different features on the predicted car prices. By examining individual feature contributions, I gained insight into how specific attributes influence the predicted prices. Understanding the factors that drive car prices is essential for accurate predictions, so in this section I investigate the contributions of the various features using SHAP values.
X_test.shape
(83026, 11)
Take 1% of X_test (the first 830 of 83,026 rows) as a manageable sample for the SHAP analysis.
# Preprocess a ~1% slice of the test set (830 of 83,026 rows) so the SHAP
# computations below stay tractable; work on a copy to leave X_test intact.
X_test_copy = X_test.copy()
X_test_1_percent = X_test_copy.iloc[:830]
X_pp = preprocessor.transform(X_test_1_percent)
X_pp.head()
| mileage | year_of_registration | crossover_car_and_van_True | standard_colour | standard_make | standard_model | body_type | fuel_type | manufacturer_popularity | condition | ... | fuel_type^2 | fuel_type manufacturer_popularity | fuel_type condition | fuel_type usage | manufacturer_popularity^2 | manufacturer_popularity condition | manufacturer_popularity usage | condition^2 | condition usage | usage^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 394603 | -0.555164 | -0.141365 | -0.058518 | -1.502321 | -0.106466 | -0.889449 | 0.435014 | 0.616082 | -1.013979 | -0.789974 | ... | 0.497653 | -0.134113 | -0.555804 | 1.252535 | -1.023893 | -0.960854 | 0.452557 | -0.836446 | -0.285895 | 0.989014 |
| 21953 | 0.992890 | -0.442900 | -0.058518 | 0.457424 | -0.507118 | -0.548045 | -1.005366 | 2.228651 | -1.013979 | -0.789974 | ... | 2.191976 | 1.018316 | -0.128871 | -0.734258 | -1.023893 | -0.960854 | -1.654888 | -0.836446 | -1.137148 | -1.420925 |
| 298625 | 1.910689 | -1.950575 | -0.058518 | -0.052888 | -1.305318 | -1.203763 | -1.005366 | -0.772114 | 0.952895 | -1.775883 | ... | -0.698201 | -0.143011 | -1.716005 | -1.586252 | 0.958826 | -1.611771 | -1.241968 | -1.359516 | -1.500128 | -1.420925 |
| 393545 | 1.110557 | -0.442900 | -0.058518 | 0.542288 | 0.075017 | 0.191731 | -1.005366 | 0.616082 | 0.952895 | -0.789974 | ... | 0.497653 | 1.071524 | -0.555804 | -1.192107 | 0.958826 | -0.569739 | -1.241968 | -0.836446 | -1.137148 | -1.420925 |
| 64172 | -0.272865 | -1.045970 | -0.058518 | 0.457424 | -1.316635 | -1.477082 | -1.005366 | 0.616082 | -1.013979 | -0.789974 | ... | 0.497653 | -0.134113 | -0.555804 | -0.000879 | -1.023893 | -0.960854 | -0.627970 | -0.836446 | -0.722348 | -0.465741 |
5 rows × 77 columns
# Draw a small background sample of preprocessed rows for SHAP's baseline.
X100 = shap.utils.sample(X_pp, 100) # 100 instances for use as the background distribution
# Apply the fitted SelectKBest selector so the matrix matches the k=22
# feature set the final regressor was trained on.
X_pp_sel = selector.transform(X_pp)
X_pp_sel.shape
(830, 22)
X_pp_sel.columns
Index(['year_of_registration', 'standard_model', 'condition',
'year_of_registration^2', 'year_of_registration standard_model',
'year_of_registration condition', 'standard_colour standard_model',
'standard_colour condition', 'standard_make standard_model',
'standard_make condition', 'standard_make usage', 'standard_model^2',
'standard_model body_type', 'standard_model fuel_type',
'standard_model manufacturer_popularity', 'standard_model condition',
'standard_model usage', 'body_type condition', 'body_type usage',
'fuel_type condition', 'manufacturer_popularity condition',
'condition^2'],
dtype='object')
X_pp_sel.head()
| year_of_registration | standard_model | condition | year_of_registration^2 | year_of_registration standard_model | year_of_registration condition | standard_colour standard_model | standard_colour condition | standard_make standard_model | standard_make condition | ... | standard_model body_type | standard_model fuel_type | standard_model manufacturer_popularity | standard_model condition | standard_model usage | body_type condition | body_type usage | fuel_type condition | manufacturer_popularity condition | condition^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 394603 | -0.141365 | -0.889449 | -0.789974 | -0.142278 | -0.889491 | -0.788924 | -1.057276 | -1.025546 | -0.670628 | -0.657210 | ... | -0.591365 | -0.674072 | -0.982307 | -0.941052 | -0.407400 | -0.469263 | 1.060020 | -0.555804 | -0.960854 | -0.836446 |
| 21953 | -0.442900 | -0.548045 | -0.789974 | -0.443894 | -0.549341 | -0.789923 | -0.453202 | -0.671850 | -0.607944 | -0.843443 | ... | -0.775905 | 0.006086 | -0.702098 | -0.774821 | -1.068909 | -1.015130 | -1.506614 | -0.128871 | -0.960854 | -0.836446 |
| 298625 | -1.950575 | -1.203763 | -1.775883 | -1.949730 | -1.207122 | -1.773891 | -1.154144 | -1.689149 | -1.101364 | -1.680570 | ... | -1.115533 | -1.167468 | -1.016512 | -1.382991 | -1.376588 | -1.599357 | -1.506614 | -1.716005 | -1.611771 | -1.359516 |
| 393545 | -0.442900 | 0.191731 | -0.789974 | -0.443894 | 0.189362 | -0.789923 | 0.294842 | -0.656533 | 0.022576 | -0.572852 | ... | -0.392739 | 0.316255 | 0.385674 | -0.414622 | -0.721789 | -1.015130 | -1.506614 | -0.555804 | -0.569739 | -0.836446 |
| 64172 | -1.045970 | -1.477082 | -0.789974 | -1.046677 | -1.477964 | -0.791920 | -1.367749 | -0.671850 | -1.200748 | -1.219728 | ... | -1.257098 | -1.212325 | -1.464610 | -1.227172 | -1.258197 | -1.015130 | -0.832163 | -0.555804 | -0.960854 | -0.836446 |
5 rows × 22 columns
# Background sample drawn from the feature-selected matrix.
X100_sel = shap.utils.sample(X_pp_sel, 100)
# Explain only the final regressor step of the pipeline: the inputs are
# already preprocessed and feature-selected, so rfr_best_pipe['regr'].predict
# is the function SHAP perturbs.
rfr_explainer = shap.Explainer(rfr_best_pipe['regr'].predict, X100_sel)
rfr_shap_values = rfr_explainer(X_pp_sel)
Permutation explainer: 831it [08:39, 1.58it/s]
X_test.head()
| mileage | standard_colour | standard_make | standard_model | year_of_registration | body_type | crossover_car_and_van | fuel_type | condition | usage | manufacturer_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 394603 | 22453.0 | Silver | SKODA | Fabia | 2015 | Estate | False | Diesel | OLD | LOW | Medium |
| 21953 | 68500.0 | White | Toyota | Auris | 2014 | Hatchback | False | Petrol Hybrid | OLD | HIGH | Medium |
| 298625 | 95800.0 | Blue | Vauxhall | Corsa | 2009 | Hatchback | False | Petrol | VERY OLD | HIGH | Very High |
| 393545 | 72000.0 | Black | Volkswagen | Golf | 2014 | Hatchback | False | Diesel | OLD | HIGH | Very High |
| 64172 | 30850.0 | White | Citroen | DS3 | 2012 | Hatchback | False | Diesel | OLD | AVERAGE | Medium |
X_pp_sel.head()
| year_of_registration | standard_model | condition | year_of_registration^2 | year_of_registration standard_model | year_of_registration condition | standard_colour standard_model | standard_colour condition | standard_make standard_model | standard_make condition | ... | standard_model body_type | standard_model fuel_type | standard_model manufacturer_popularity | standard_model condition | standard_model usage | body_type condition | body_type usage | fuel_type condition | manufacturer_popularity condition | condition^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 394603 | -0.141365 | -0.889449 | -0.789974 | -0.142278 | -0.889491 | -0.788924 | -1.057276 | -1.025546 | -0.670628 | -0.657210 | ... | -0.591365 | -0.674072 | -0.982307 | -0.941052 | -0.407400 | -0.469263 | 1.060020 | -0.555804 | -0.960854 | -0.836446 |
| 21953 | -0.442900 | -0.548045 | -0.789974 | -0.443894 | -0.549341 | -0.789923 | -0.453202 | -0.671850 | -0.607944 | -0.843443 | ... | -0.775905 | 0.006086 | -0.702098 | -0.774821 | -1.068909 | -1.015130 | -1.506614 | -0.128871 | -0.960854 | -0.836446 |
| 298625 | -1.950575 | -1.203763 | -1.775883 | -1.949730 | -1.207122 | -1.773891 | -1.154144 | -1.689149 | -1.101364 | -1.680570 | ... | -1.115533 | -1.167468 | -1.016512 | -1.382991 | -1.376588 | -1.599357 | -1.506614 | -1.716005 | -1.611771 | -1.359516 |
| 393545 | -0.442900 | 0.191731 | -0.789974 | -0.443894 | 0.189362 | -0.789923 | 0.294842 | -0.656533 | 0.022576 | -0.572852 | ... | -0.392739 | 0.316255 | 0.385674 | -0.414622 | -0.721789 | -1.015130 | -1.506614 | -0.555804 | -0.569739 | -0.836446 |
| 64172 | -1.045970 | -1.477082 | -0.789974 | -1.046677 | -1.477964 | -0.791920 | -1.367749 | -0.671850 | -1.200748 | -1.219728 | ... | -1.257098 | -1.212325 | -1.464610 | -1.227172 | -1.258197 | -1.015130 | -0.832163 | -0.555804 | -0.960854 | -0.836446 |
5 rows × 22 columns
pd.DataFrame(rfr_shap_values.values, columns=X_pp_sel.columns).head()
| year_of_registration | standard_model | condition | year_of_registration^2 | year_of_registration standard_model | year_of_registration condition | standard_colour standard_model | standard_colour condition | standard_make standard_model | standard_make condition | ... | standard_model body_type | standard_model fuel_type | standard_model manufacturer_popularity | standard_model condition | standard_model usage | body_type condition | body_type usage | fuel_type condition | manufacturer_popularity condition | condition^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -10.344964 | -297.581203 | -34.190112 | 16.981662 | -510.673763 | 45.530051 | -256.997969 | -37.146414 | -518.549378 | -530.183375 | ... | 7.480376 | -198.687333 | -273.856309 | -1183.843955 | -553.999053 | -30.674258 | 441.118705 | -96.112909 | -113.361304 | -38.124900 |
| 1 | -82.561067 | -128.788591 | -35.845637 | -68.468621 | -271.875836 | -49.293666 | -140.226800 | -15.155089 | -200.130476 | -481.393563 | ... | -72.686357 | 302.012376 | -198.109073 | -798.041915 | -1128.233172 | -247.272965 | -380.954952 | 172.821658 | -87.012731 | -33.128750 |
| 2 | -515.443394 | -325.460728 | -91.613024 | -494.880945 | -585.604297 | -545.843153 | -230.351089 | -105.298608 | -409.541156 | -910.276270 | ... | -334.150151 | -170.481570 | -297.350440 | -1836.956888 | -1568.213018 | -662.204298 | -545.866882 | -137.496045 | -143.664900 | -122.650338 |
| 3 | -101.101619 | 21.671824 | -60.424754 | -91.897134 | -190.485247 | -80.910631 | 16.662304 | -36.602073 | -159.258969 | -487.718081 | ... | 9.327643 | -114.464405 | 63.958878 | -743.622285 | -955.699638 | -231.123185 | -340.657804 | -117.443310 | -27.351264 | -43.780092 |
| 4 | -382.167144 | -249.179214 | -51.750048 | -379.131327 | -435.694261 | -427.440676 | -230.009925 | -38.103202 | -403.377505 | -657.787294 | ... | -283.355507 | -178.194387 | -202.108030 | -1555.117476 | -1233.146106 | -371.133485 | -251.834515 | -79.579213 | -90.856420 | -60.309481 |
5 rows × 22 columns
# Mean test-set selling price (~£12,964) — reference point for the SHAP baseline.
y_test.mean()
12963.855611495193
# First five actual prices in the test set (index presumably the original
# advert row ids — matches the index shown on X_pp_sel below).
y_test.head(5)
394603 8499 21953 8995 298625 1495 393545 6990 64172 5500 Name: price, dtype: int64
# Actual price (8995) of the advert at position 1 — the row explained
# by the SHAP waterfall plot below.
y_test.iloc[1]
8995
# First three rows of the preprocessed, feature-selected design matrix.
X_pp_sel.head(3)
| year_of_registration | standard_model | condition | year_of_registration^2 | year_of_registration standard_model | year_of_registration condition | standard_colour standard_model | standard_colour condition | standard_make standard_model | standard_make condition | ... | standard_model body_type | standard_model fuel_type | standard_model manufacturer_popularity | standard_model condition | standard_model usage | body_type condition | body_type usage | fuel_type condition | manufacturer_popularity condition | condition^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 394603 | -0.141365 | -0.889449 | -0.789974 | -0.142278 | -0.889491 | -0.788924 | -1.057276 | -1.025546 | -0.670628 | -0.657210 | ... | -0.591365 | -0.674072 | -0.982307 | -0.941052 | -0.407400 | -0.469263 | 1.060020 | -0.555804 | -0.960854 | -0.836446 |
| 21953 | -0.442900 | -0.548045 | -0.789974 | -0.443894 | -0.549341 | -0.789923 | -0.453202 | -0.671850 | -0.607944 | -0.843443 | ... | -0.775905 | 0.006086 | -0.702098 | -0.774821 | -1.068909 | -1.015130 | -1.506614 | -0.128871 | -0.960854 | -0.836446 |
| 298625 | -1.950575 | -1.203763 | -1.775883 | -1.949730 | -1.207122 | -1.773891 | -1.154144 | -1.689149 | -1.101364 | -1.680570 | ... | -1.115533 | -1.167468 | -1.016512 | -1.382991 | -1.376588 | -1.599357 | -1.506614 | -1.716005 | -1.611771 | -1.359516 |
3 rows × 22 columns
# SHAP waterfall for the advert at position 1: shows how each feature's
# contribution pushes the prediction away from the model's baseline value.
shap.plots.waterfall(rfr_shap_values[1])
• standard_model usage: the largest negative SHAP contribution (−1128.23); this advert's value for the feature pushes the predicted price down by roughly £1,128 relative to the model's baseline.
• standard_model condition: a SHAP contribution of −798.04, pulling this advert's predicted price down.
• standard_make usage: a SHAP contribution of −721.02, also lowering the prediction.
• standard_make condition: a SHAP contribution of −481.39, lowering the prediction.
• body_type usage: a SHAP contribution of −380.95, lowering the prediction.
• standard_model fuel_type: the only sizeable positive contribution (+302.01); this advert's fuel-type interaction pushes the predicted price up.
• year_of_registration standard_model: a SHAP contribution of −271.88, lowering the prediction.
• body_type condition: a SHAP contribution of −247.27, lowering the prediction.
• standard_make standard_model: a SHAP contribution of −200.13, lowering the prediction.
# Partial-dependence plots must be drawn in the same feature space the model
# was trained on, so re-apply the fitted preprocessor and feature selector
# to the held-out test set.
from sklearn.inspection import PartialDependenceDisplay
X_test_pp = preprocessor.transform(X_test_copy)
X_test_sel = selector.transform(X_test_pp)
# 77 columns after preprocessing, before feature selection.
X_test_pp.shape
(83026, 77)
# 22 columns remain after feature selection.
X_test_sel.shape
(83026, 22)
# Peek at the selected test features. NOTE(review): .head() implies the
# selector is configured to output a DataFrame (e.g. set_output) — confirm,
# since transform() otherwise returns a plain ndarray.
X_test_sel.head()
| year_of_registration | standard_model | condition | year_of_registration^2 | year_of_registration standard_model | year_of_registration condition | standard_colour standard_model | standard_colour condition | standard_make standard_model | standard_make condition | ... | standard_model body_type | standard_model fuel_type | standard_model manufacturer_popularity | standard_model condition | standard_model usage | body_type condition | body_type usage | fuel_type condition | manufacturer_popularity condition | condition^2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 394603 | -0.141365 | -0.889449 | -0.789974 | -0.142278 | -0.889491 | -0.788924 | -1.057276 | -1.025546 | -0.670628 | -0.657210 | ... | -0.591365 | -0.674072 | -0.982307 | -0.941052 | -0.407400 | -0.469263 | 1.060020 | -0.555804 | -0.960854 | -0.836446 |
| 21953 | -0.442900 | -0.548045 | -0.789974 | -0.443894 | -0.549341 | -0.789923 | -0.453202 | -0.671850 | -0.607944 | -0.843443 | ... | -0.775905 | 0.006086 | -0.702098 | -0.774821 | -1.068909 | -1.015130 | -1.506614 | -0.128871 | -0.960854 | -0.836446 |
| 298625 | -1.950575 | -1.203763 | -1.775883 | -1.949730 | -1.207122 | -1.773891 | -1.154144 | -1.689149 | -1.101364 | -1.680570 | ... | -1.115533 | -1.167468 | -1.016512 | -1.382991 | -1.376588 | -1.599357 | -1.506614 | -1.716005 | -1.611771 | -1.359516 |
| 393545 | -0.442900 | 0.191731 | -0.789974 | -0.443894 | 0.189362 | -0.789923 | 0.294842 | -0.656533 | 0.022576 | -0.572852 | ... | -0.392739 | 0.316255 | 0.385674 | -0.414622 | -0.721789 | -1.015130 | -1.506614 | -0.555804 | -0.569739 | -0.836446 |
| 64172 | -1.045970 | -1.477082 | -0.789974 | -1.046677 | -1.477964 | -0.791920 | -1.367749 | -0.671850 | -1.200748 | -1.219728 | ... | -1.257098 | -1.212325 | -1.464610 | -1.227172 | -1.258197 | -1.015130 | -0.832163 | -0.555804 | -0.960854 | -0.836446 |
5 rows × 22 columns
# Re-apply the notebook-wide plotting configuration (same settings as the
# setup cell at the top of the notebook) before drawing the PDP figures.
%config InlineBackend.figure_format = 'retina'
# https://seaborn.pydata.org/tutorial/aesthetics.html
sns.set(
style='ticks',
context='talk',
font_scale=0.8,
rc={'figure.figsize': (8,6)}
)
Random-forest Regressor
# PDP plus individual ICE curves (kind='both') for the single interaction
# term with the largest SHAP contribution.
PartialDependenceDisplay.from_estimator(
    estimator=rfr_best_pipe.named_steps['regr'],
    X=X_test_sel,
    features=['standard_model condition'],
    kind='both',
)
<sklearn.inspection._plot.partial_dependence.PartialDependenceDisplay at 0x7fc2986163d0>
# The four interaction terms with the largest SHAP contributions; this list
# is reused by the centred and average-only PDP cells below.
features = [
    'standard_model condition',
    'standard_model usage',
    'standard_make condition',
    'standard_make usage',
]
# ICE + average PDP curves for all four features in a 2x2 grid,
# subsampling 100 adverts so the ICE lines stay legible.
fig, axis = plt.subplots(figsize=(8, 6), constrained_layout=True)
PartialDependenceDisplay.from_estimator(
    rfr_best_pipe.named_steps['regr'],
    X_test_sel,
    features=features,
    kind='both',
    subsample=100,
    grid_resolution=30,
    n_jobs=2,
    random_state=0,
    ax=axis,
    n_cols=2,
);
# Same four PDP/ICE panels, but centred (centered=True) so every curve
# starts at zero and shapes can be compared without their vertical offsets.
fig, axis = plt.subplots(figsize=(8, 6), constrained_layout=True)
PartialDependenceDisplay.from_estimator(
    rfr_best_pipe.named_steps['regr'],
    X_test_sel,
    features=features,
    kind='both',
    centered=True,
    subsample=100,
    grid_resolution=30,
    n_jobs=2,
    random_state=0,
    ax=axis,
    n_cols=2,
);
# Average-only partial dependence (kind='average') for the same four
# features — the mean curve without the individual ICE lines.
fig, axis = plt.subplots(figsize=(8, 6), constrained_layout=True)
PartialDependenceDisplay.from_estimator(
    rfr_best_pipe.named_steps['regr'],
    X_test_sel,
    features=features,
    kind='average',
    subsample=100,
    grid_resolution=30,
    n_jobs=2,
    random_state=0,
    ax=axis,
    n_cols=2,
);